Source code for GetDNA
# -*- coding: utf-8 -*-
# Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
# All rights reserved.
# This file is part of the PyBioMed.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the PyBioMed source tree.
"""
This module is used for downloading DNA sequences from the EBI ENA web
service. You only need to supply a sequence ID.
Authors: Zhijiang Yao and Dongsheng Cao.
Date: 2016.11.04
Email: gadsby@163.com
"""
import urllib
import sys
# The four DNA nucleotide letters.
# NOTE(review): not referenced elsewhere in the visible code; presumably meant
# to be passed as the `alphabet` argument of IsUnderAlphabet -- confirm.
ALPHABET = 'ACGT'
class Seq:
    """
    One named sequence record.

    Attributes set by the constructor:
      name   -- the record name, stored as given.
      seq    -- the sequence text converted to upper case.
      no     -- the record number, stored as given.
      length -- len() of the sequence text as it was passed in.
    """

    def __init__(self, name, seq, no):
        self.name = name
        # Upper-case first, then measure the text exactly as supplied.
        upper_cased = seq.upper()
        self.seq = upper_cased
        self.no = no
        self.length = len(seq)

    def __str__(self):
        """Render the record as a header line followed by the sequence."""
        fields = (self.name, str(self.no), str(self.length), self.seq)
        return "%s\tNo:%s\tlength:%s\n%s" % fields
def GetDNAFromUniGene(SeqID = ''):
    '''
    Download a DNA sequence record for the given sequence ID.

    The record is requested from the EBI ENA "data/view" URL with
    ``display=fasta``.

    :param SeqID: The sequence ID inserted into the request URL.

    Return the raw response body exactly as returned by ``read()`` (a str on
    Python 2, bytes on Python 3). Errors raised by ``urlopen`` propagate to
    the caller.
    '''
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # NOTE(review): confirm this ENA endpoint is still served; the URL is kept
    # unchanged from the original code.
    url = 'http://www.ebi.ac.uk/ena/data/view/{0}&display=fasta'.format(SeqID)
    # closing() guarantees the connection is released even if read() fails;
    # it also works for the Python 2 response object, which is not itself a
    # context manager.
    with closing(urlopen(url)) as response:
        return response.read()
def IsUnderAlphabet(s, alphabet):
    """
    #################################################################
    Check whether every element of s belongs to the given alphabet.

    :param s: The string to check.

    :param alphabet: The collection of permitted characters.

    Return True when every character is permitted; otherwise return the
    first character of s that is not in the alphabet.
    #################################################################
    """
    # Lazily scan for offending characters; fall back to True if none exist.
    offenders = (ch for ch in s if ch not in alphabet)
    return next(offenders, True)
def IsFasta(seq):
    """
    #################################################################
    Check that a sequence record is usable as a FASTA entry.

    The record is rejected in three situations (tested in this order):

    1. It has no name.

    2. Its name contains a '>' character.

    3. Its length is zero.

    :param seq: Object exposing ``name``, ``no`` and ``length`` (e.g. Seq).

    Return True when the record passes all checks. Otherwise write an
    error message to sys.stderr and return False; for a missing name the
    record itself is also printed to stdout first.
    #################################################################
    """
    show_record = False
    if not seq.name:
        reason = ' has no sequence name.'
        show_record = True
    elif '>' in seq.name:
        reason = ' name has > character.'
    elif seq.length == 0:
        reason = ' is null.'
    else:
        return True

    # Single failure exit: build the message, then emit the diagnostics.
    message = 'Error, sequence ' + str(seq.no) + reason
    if show_record:
        print(seq)
    sys.stderr.write(message)
    return False
def ReadFasta(f):
    """
    #################################################################
    Read a fasta file.

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return a list of sequence strings, one per record in file order. Each
    string is the concatenation of the record's whitespace-stripped lines,
    in the case used in the file (only the temporary Seq used for
    validation is upper-cased).

    Every record is validated with IsFasta; if a record is rejected the
    function calls sys.exit(0).
    #################################################################
    """
    name = ''
    parts = []      # stripped sequence lines of the record being read
    count = 0       # number of header ('>') lines seen so far
    seq_list = []

    for line in f.readlines():
        if not line:
            break

        if line[0] == '>':
            seq = ''.join(parts)
            # A new header closes the previous record. Nothing is flushed
            # for the very first header unless sequence text preceded it
            # (that headerless text is then validated and rejected).
            if count != 0 or seq != '':
                if not IsFasta(Seq(name, seq, count)):
                    # NOTE(review): exit status 0 on malformed input looks
                    # unintended; kept for backward compatibility.
                    sys.exit(0)
                seq_list.append(seq)

            parts = []
            name = line[1:].strip()
            count += 1
        else:
            parts.append(line.strip())

    # Flush the last record. It is numbered by the headers seen so far, so
    # error messages name the right record (previously off by one). Input
    # without any header is still reported as sequence 1.
    seq = ''.join(parts)
    if not IsFasta(Seq(name, seq, max(count, 1))):
        sys.exit(0)
    seq_list.append(seq)

    return seq_list
if __name__ == '__main__':
    # Manual smoke test: fetch one record over the network, then parse the
    # example FASTA file shipped with the test data. Requires PyBioMed to be
    # importable (for the timelimited decorator) and network access.
    # Messages use the parenthesised single-argument print form, which
    # behaves the same on Python 2 and Python 3.
    print('-' * 10 + 'START' + '-' * 10)
    print('Only PyBioMed is successfully installed the code below can be run!')
    from PyBioMed.PyGetMol.GetProtein import timelimited

    # NOTE(review): timelimited(10) presumably aborts the call after 10
    # seconds -- confirm against PyBioMed.PyGetMol.GetProtein.
    @timelimited(10)
    def run_GetDNAFromUniGene():
        seqid = 'AA954964'
        print(GetDNAFromUniGene(seqid))

    @timelimited(10)
    def run_ReadFasta():
        # Close the file deterministically instead of leaking the handle.
        with open('../test/test_data/example.fasta') as handle:
            dna = ReadFasta(handle)
        print(dna)

    run_GetDNAFromUniGene()
    print('-' * 25)
    run_ReadFasta()
    print('-' * 10 + 'END' + '-' * 10)