# Source code for GetSubSeq
# -*- coding: utf-8 -*-
# Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
# All rights reserved.
# This file is part of the PyBioMed.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the PyBioMed source tree.
"""
#####################################################################################
The prediction of functional sites (e.g., methylation) of proteins usually requires
splitting the whole protein into a set of segments centered on a specific amino acid.
Given a window size p, all such segments of length (2*p+1) can be obtained easily.
Note that the output of the method is a list. You can freely use and distribute it.
If you have any problems, please contact us.
Authors: Zhijiang Yao and Dongsheng Cao.
Date: 2016.06.04
Email: gadsby@163.com
#####################################################################################
"""
import re
import string
# One-letter codes of the 20 standard amino acids.
AALetter = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
#############################################################################################


def GetSubSequence(ProteinSequence, ToAA='S', window=3):
    """
    #######################################################################
    Get all sub-sequences of length 2*window+1 whose center is ToAA.

    Usage:

    result = GetSubSequence(protein, ToAA, window)

    Input: protein is a pure protein sequence (a string of one-letter
    amino acid codes).

    ToAA is the central (query point) amino acid in the sub-sequence.
    If ToAA is not one of the 20 standard one-letter codes in AALetter,
    the second residue of the sequence (ProteinSequence[1]) is used as
    the center instead.

    window is the span, i.e. the number of residues kept on each side
    of the central amino acid.

    Output: result is a list containing every sub-sequence that fits
    entirely inside the protein, in order of occurrence. Occurrences of
    ToAA that are closer than window residues to either end are skipped.
    An empty list is returned when no occurrence qualifies, including
    when the fallback center is needed but the sequence has fewer than
    two residues.
    #######################################################################
    """
    if ToAA not in AALetter:
        # Historical fallback: use the second residue as the center.
        # Guard against sequences that have no second residue, which
        # previously raised IndexError.
        if len(ProteinSequence) < 2:
            return []
        ToAA = ProteinSequence[1]

    Num = len(ProteinSequence)
    result = []
    # Compare residues literally rather than through a regular expression,
    # so a fallback character such as '.' or '*' is never interpreted as a
    # regex metacharacter.
    for center, residue in enumerate(ProteinSequence):
        if residue != ToAA:
            continue
        start = center - window
        stop = center + window + 1
        # Keep the window only if it lies completely inside the sequence.
        if start >= 0 and stop <= Num:
            result.append(ProteinSequence[start:stop])
    return result
#############################################################################################
if __name__ == "__main__":
    # Demo: extract every 21-residue window (10 residues on each side)
    # centered on an aspartic acid ('D') in a sample sequence.
    protein = "ADGCGVGEGTGQGPMCNCMCMKWVYADEDAADLESDSFADEDASLESDSFPWSNQRVFCSFADEDAS"
    subseq = GetSubSequence(protein, ToAA='D', window=10)
    # Single-argument print(...) behaves identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(subseq)
    print(len(subseq))