# Source code for GetDNA
# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""

This module downloads a DNA sequence in FASTA format from the EBI ENA web

service (you only need to supply a sequence ID) and provides helpers for

reading and validating FASTA files.


Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.11.04

Email: gadsby@163.com

"""


import urllib
import sys

# Letters of the DNA nucleotide alphabet (A, C, G, T). Not referenced by the
# code visible in this file; presumably meant to be passed as the `alphabet`
# argument of IsUnderAlphabet -- TODO confirm against callers.
ALPHABET = 'ACGT'

class Seq:
    """
    A single named sequence record.

    Attributes:
        name: Sequence name, stored exactly as passed in.
        seq: Sequence text, converted to upper case.
        no: Record number supplied by the caller; shown by ``__str__``.
        length: Number of characters in the sequence.
    """

    def __init__(self, name, seq, no):
        """
        Store the record fields.

        :param name: Sequence name.
        :param seq: Sequence string; it is stored upper-cased.
        :param no: Record number of this sequence.
        """
        self.name = name
        self.seq = seq.upper()
        self.no = no
        self.length = len(seq)

    def __str__(self):
        """
        Return the name, number and length separated by tabs on one line,
        followed by the sequence on the next line.
        """
        return "%s\tNo:%s\tlength:%s\n%s" % (self.name, str(self.no), str(self.length), self.seq)

def GetDNAFromUniGene(SeqID = ''):
    """
    Download the FASTA record of a DNA sequence from the EBI ENA web service.

    :param SeqID: Sequence ID substituted into the request URL,
        e.g. 'AA954964'.

    Return the response body exactly as read from the HTTP response
    (``str`` on Python 2, ``bytes`` on Python 3).

    Any error raised by ``urlopen`` is propagated to the caller.
    """
    # Local imports keep the function usable on both Python 2 (``urlopen``
    # lives in ``urllib``) and Python 3 (``urllib.request``).
    from contextlib import closing
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    # NOTE(review): this is the legacy ENA "data/view" endpoint; confirm it is
    # still served before relying on it.
    url = 'http://www.ebi.ac.uk/ena/data/view/{0}&display=fasta'.format(SeqID)

    # closing() guarantees the connection is released even if read() raises;
    # the Python 2 response object is not a context manager by itself.
    with closing(urlopen(url)) as response:
        return response.read()

def IsUnderAlphabet(s, alphabet):
    """
    #################################################################
    Judge whether every character of the string belongs to the alphabet.

    :param s: The string to check.
    :param alphabet: Container of allowed characters (e.g. 'ACGT').

    Return True if all characters are allowed (including for an empty
    string), otherwise return the first character not in the alphabet.

    Note: the returned error character is itself truthy, so callers must
    test the result with ``is True`` rather than plain truthiness.
    #################################################################
    """
    for e in s:
        if e not in alphabet:
            return e

    return True

def IsFasta(seq):
    """
    #################################################################
    Judge whether the Seq object is a valid FASTA record.

    The record is rejected in three situations:
    1. No seq name.
    2. Seq name is illegal (it contains a '>' character).
    3. No sequence (length is 0).

    :param seq: Seq object (its ``name``, ``no`` and ``length`` are read).

    Return True if the record is valid. Otherwise write an error message
    to stderr and return False.
    #################################################################
    """
    if not seq.name:
        error_info = 'Error, sequence ' + str(seq.no) + ' has no sequence name.'
        # Echo the offending record to stdout (behavior kept from the
        # original implementation).
        print(seq)
        # Terminate each message with a newline so consecutive messages and
        # the shell prompt do not run together on one line.
        sys.stderr.write(error_info + '\n')
        return False
    if '>' in seq.name:
        error_info = 'Error, sequence ' + str(seq.no) + ' name has > character.'
        sys.stderr.write(error_info + '\n')
        return False
    if seq.length == 0:
        error_info = 'Error, sequence ' + str(seq.no) + ' is null.'
        sys.stderr.write(error_info + '\n')
        return False

    return True

def ReadFasta(f):
    """
    #################################################################
    Read a fasta file.

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return a list with one sequence string per record, in file order.
    The lines of a record are stripped and joined; the text is returned
    as read (it is not upper-cased) and the record names are not returned.

    Every record is validated with IsFasta. If a record is invalid the
    error is reported on stderr by IsFasta and the interpreter is
    terminated via sys.exit with a non-zero status.
    #################################################################
    """
    name = ''
    chunks = []     # non-empty sequence lines of the record being read
    count = 0       # number of header lines seen so far
    seq_list = []

    for line in f.readlines():
        # readlines() on a real file never yields an empty string; the
        # guard is kept for list-like handles that might.
        if not line:
            break

        if '>' == line[0]:
            # A header closes the previous record. Before the first header
            # there is nothing to close unless sequence text preceded it.
            if count != 0 or chunks:
                seq = ''.join(chunks)
                if IsFasta(Seq(name, seq, count)):
                    seq_list.append(seq)
                else:
                    # Non-zero status: validation failure is an error.
                    sys.exit(1)

            chunks = []
            name = line[1:].strip()
            count += 1
        else:
            piece = line.strip()
            # Skip blank lines so "chunks is non-empty" means exactly
            # "some sequence text has been read".
            if piece:
                chunks.append(piece)

    # Close the last record. It is record number `count` (the headers seen
    # so far); input without any header is reported as record 1.
    seq = ''.join(chunks)
    if IsFasta(Seq(name, seq, count if count else 1)):
        seq_list.append(seq)
    else:
        sys.exit(1)

    return seq_list


# Manual smoke test: downloads one sequence and parses the example FASTA file.
# The parenthesized single-argument print calls behave the same on Python 2
# and Python 3.
if __name__ == '__main__':
    print('-'*10+'START'+'-'*10)
    print('Only PyBioMed is successfully installed the code below can be run！')
    # Imported here because it is only needed by this demo.
    # NOTE(review): timelimited is defined in PyBioMed.PyGetMol.GetProtein and
    # is not visible here; presumably it bounds each call to 10 seconds.
    from PyBioMed.PyGetMol.GetProtein import timelimited

    @timelimited(10)
    def run_GetDNAFromUniGene():
        seqid = 'AA954964'
        print(GetDNAFromUniGene(seqid))

    @timelimited(10)
    def run_ReadFasta():
        # The path is relative to the current working directory; the handle
        # is closed as soon as parsing finishes.
        with open('../test/test_data/example.fasta') as handle:
            dna = ReadFasta(handle)
        print(dna)

    run_GetDNAFromUniGene()
    print('-'*25)
    run_ReadFasta()
    print('-'*10+'END'+'-'*10)