# Source code for GetSubSeq

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
#####################################################################################

Predicting functional sites of proteins (e.g., methylation sites) usually requires

splitting the whole protein into a set of segments centred on a specific amino acid.

Given a window size p, all segments of length (2*p+1) can be obtained very easily.

Note that the method returns its result as a list. You can freely use and

distribute it. If you have any problems, please feel free to contact us.

Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.06.04

Email: gadsby@163.com

#####################################################################################

"""

import re
import string

AALetter = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]


#############################################################################################

def GetSubSequence(ProteinSequence, ToAA='S', window=3):
    """
    #######################################################################
    Get all sub-sequences of length 2*window+1 whose center is ToAA in a
    protein.

    Usage:

    result = GetSubSequence(protein, ToAA, window)

    Input: protein is a pure protein sequence (a string of residue letters).

    ToAA is the central (query point) amino acid in the sub-sequence. If it
    is not one of the 20 letters listed in AALetter, the second residue of
    the sequence (ProteinSequence[1]) is used as the query residue instead.

    window is the span, i.e. the number of residues kept on each side of
    the central residue.

    Output: result is a list containing, in sequence order, every
    sub-sequence centered on an occurrence of ToAA. Occurrences that have
    fewer than window residues on either side are skipped. An empty list is
    returned when ToAA is not a standard residue and the sequence is too
    short to supply the fallback residue.
    #######################################################################
    """
    if ToAA not in AALetter:
        # The fallback needs a second residue; without one there is nothing
        # to search for, so report "no sub-sequences" instead of raising
        # IndexError.
        if len(ProteinSequence) < 2:
            return []
        ToAA = ProteinSequence[1]

    Num = len(ProteinSequence)
    result = []
    # Compare residues literally rather than through a regular expression:
    # the fallback residue comes straight from the sequence and may be a
    # regex metacharacter such as '*' (stop codon) or '.'.
    for center, residue in enumerate(ProteinSequence):
        if residue != ToAA:
            continue
        # Keep only occurrences with a full window available on both sides.
        if center - window >= 0 and center + window < Num:
            result.append(ProteinSequence[center - window:center + window + 1])
    return result
#############################################################################################
if __name__ == "__main__":
    # Demo: list every 21-residue segment (window=10 on each side) centred
    # on an aspartate ('D') in a sample sequence, then report how many
    # segments were found.
    protein = "ADGCGVGEGTGQGPMCNCMCMKWVYADEDAADLESDSFADEDASLESDSFPWSNQRVFCSFADEDAS"
    subseq = GetSubSequence(protein, ToAA='D', window=10)
    # Single-argument print(...) produces the same output on Python 2
    # (parenthesised expression) and Python 3 (function call).
    print(subseq)
    print(len(subseq))