# Source code for GetProteinFromUniprot

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
################################################################################################

This module is used to download protein sequences from the UniProt website

(http://www.uniprot.org/). You only need to input a protein ID or prepare a file

(ID.txt) listing the IDs. You can then obtain a .txt file (ProteinSequence.txt)

containing the protein sequences you need. You can freely use and distribute it.

If you have any problems, please contact us!
 
Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.06.04

Email: gadsby@163.com

################################################################################################
"""

import urllib
import string


##################################################################################################
def GetProteinSequence(ProteinID):
    """
    #########################################################################################
    Get the protein sequence from the uniprot website by ID.

    Usage:

    result=GetProteinSequence(ProteinID)

    Input: ProteinID is a string indicating ID such as "P48039". It is converted

    with str() and used to build the URL of the ".fasta" download.

    Output: result is a protein sequence: every line of the response except the

    first one, stripped of surrounding whitespace and concatenated into one string.

    An empty response yields an empty string.
    #########################################################################################
    """
    # Imported locally so the function runs on both Python 3
    # (urllib.request.urlopen) and Python 2 (urllib.urlopen).
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    ID = str(ProteinID)
    localfile = urlopen('http://www.uniprot.org/uniprot/' + ID + '.fasta')
    try:
        temp = localfile.readlines()
    finally:
        # Always release the connection, even if reading the response fails.
        localfile.close()

    parts = []
    # The first line of a FASTA record is the header, not sequence data,
    # so it is skipped.
    for line in temp[1:]:
        if not isinstance(line, str):
            # On Python 3 the HTTP response yields bytes; convert to text.
            line = line.decode('utf-8')
        parts.append(line.strip())
    return ''.join(parts)


##################################################################################################
def GetProteinSequenceFromTxt(path, openfile, savefile):
    """
    #########################################################################################
    Get the protein sequence from the uniprot website by the file containing ID.

    Usage:

    result=GetProteinSequenceFromTxt(path,openfile,savefile)

    Input: path is a directory path containing the ID file such as "/home/orient/protein/"

    (it is concatenated directly with the file names, so it must end with a separator)

    openfile is the ID file such as "proteinID.txt" (one ID per line; blank lines are skipped)

    savefile is the file saving the obtained protein sequences such as "protein.txt"

    (one sequence per line; an existing file is overwritten)

    Output: result is 0 once every ID in the file has been processed.
    #########################################################################################
    """
    separator = "--------------------------------------------------------"
    # Context managers guarantee both files are closed even when a download
    # raises part-way through. The output file is opened first, as before.
    with open(path + savefile, 'wb') as f1:
        with open(path + openfile, 'r') as f2:
            # NOTE: index counts input lines, including skipped blank ones,
            # so the number printed below is the line number of the ID.
            for index, i in enumerate(f2):
                itrim = i.strip()
                if itrim == "":
                    continue
                temp = GetProteinSequence(itrim)
                print(separator)
                print("The %d protein sequence has been downloaded!" % (index + 1))
                print(temp)
                record = temp + '\n'
                # The file is opened in binary mode; on Python 3 text must be
                # encoded first (on Python 2 str is already bytes).
                if not isinstance(record, bytes):
                    record = record.encode('utf-8')
                f1.write(record)
                print(separator)
    return 0


##################################################################################################
if __name__ == '__main__':
    # Demo run: download and print the sequence for each ID in a small
    # hard-coded list. The print calls use the single-argument form so the
    # output is the same on Python 2 and Python 3.
    separator = "--------------------------------------------------------"
    localfile = ['P48039']
    for index, i in enumerate(localfile):
        itrim = i.strip()
        if itrim == "":
            # Skip empty entries rather than requesting an invalid URL.
            continue
        temp = GetProteinSequence(itrim)
        print(separator)
        print("The %d protein sequence has been downloaded!" % (index + 1))
        print(temp)
        print(separator)