# -*- coding: utf-8 -*-
# Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
# All rights reserved.
# This file is part of the PyBioMed.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the PyBioMed source tree.
'''
#####################################################################################
This module is used for computing the composition, transition and distribution
descriptors based on the different properties of AADs. The AADs with the same
properties are marked with the same number. You can get 147 descriptors for a given
protein sequence. You can freely use and distribute it. If you have any problems,
please contact us!
References:
[1]: Inna Dubchak, Ilya Muchnik, Stephen R. Holbrook and Sung-Hou Kim. Prediction
of protein folding class using global description of amino acid sequence. Proc. Natl.
Acad. Sci. USA, 1995, 92, 8700-8704.
[2]: Inna Dubchak, Ilya Muchnik, Christopher Mayor, Igor Dralyuk and Sung-Hou Kim.
Recognition of a Protein Fold in the Context of the SCOP classification. Proteins:
Structure, Function and Genetics, 1999, 35, 401-407.
Authors: Zhijiang Yao and Dongsheng Cao.
Date: 2016.06.04
Email: gadsby@163.com
#####################################################################################
'''
import string, math, copy
AALetter = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
_Hydrophobicity = {'1': 'RKEDQN', '2': 'GASTPHY', '3': 'CLVIMFW'}
# '1'stand for Polar; '2'stand for Neutral, '3' stand for Hydrophobicity
_NormalizedVDWV = {'1': 'GASTPD', '2': 'NVEQIL', '3': 'MHKFRYW'}
# '1'stand for (0-2.78); '2'stand for (2.95-4.0), '3' stand for (4.03-8.08)
_Polarity = {'1': 'LIFWCMVY', '2': 'CPNVEQIL', '3': 'KMHFRYW'}
# '1'stand for (4.9-6.2); '2'stand for (8.0-9.2), '3' stand for (10.4-13.0)
_Charge = {'1': 'KR', '2': 'ANCQGHILMFPSTWYV', '3': 'DE'}
# '1'stand for Positive; '2'stand for Neutral, '3' stand for Negative
_SecondaryStr = {'1': 'EALMQKRH', '2': 'VIYCWFT', '3': 'GNPSD'}
# '1'stand for Helix; '2'stand for Strand, '3' stand for coil
_SolventAccessibility = {'1': 'ALFCGIVW', '2': 'RKQEND', '3': 'MPSTHY'}
# '1'stand for Buried; '2'stand for Exposed, '3' stand for Intermediate
_Polarizability = {'1': 'GASDT', '2': 'CPNVEQIL', '3': 'KMHFRYW'}
# '1'stand for (0-0.108); '2'stand for (0.128-0.186), '3' stand for (0.219-0.409)
##You can continuely add other properties of AADs to compute descriptors of protein sequence.
_AATProperty = (
_Hydrophobicity, _NormalizedVDWV, _Polarity, _Charge, _SecondaryStr, _SolventAccessibility, _Polarizability)
_AATPropertyName = (
'_Hydrophobicity', '_NormalizedVDWV', '_Polarity', '_Charge', '_SecondaryStr', '_SolventAccessibility',
'_Polarizability')
##################################################################################################
def StringtoNum(ProteinSequence, AAProperty):
    """
    Transform the protein sequence into its class-label form, such as
    32123223132121123.

    Every character that appears in one of the classes of AAProperty is
    replaced by that class's key; characters that belong to no class are
    left unchanged.

    Usage:
        result = StringtoNum(protein, AAProperty)

    Input: ProteinSequence is a pure protein sequence (a string).
           AAProperty is a dict mapping a class label to the string of
           amino acid letters in that class, such as _Polarizability.

    Output: result is a string such as 123321222132111123222.
    """
    # Strings are immutable, so no defensive copy is needed. The str.replace
    # method is used instead of string.replace, which exists only in Python 2.
    encoded = ProteinSequence
    for label, residues in AAProperty.items():
        for residue in residues:
            encoded = encoded.replace(residue, label)
    return encoded
def CalculateComposition(ProteinSequence, AAProperty, AAPName):
    """
    A method used for computing composition descriptors: the fraction of
    the encoded sequence occupied by each of the classes '1', '2' and '3'.

    Usage:
        result = CalculateComposition(protein, AAProperty, AAPName)

    Input: ProteinSequence is a pure protein sequence.
           AAProperty is a dict containing the classification of amino
           acids, such as _Polarizability.
           AAPName is a string used as the prefix of the result keys.

    Output: result is a dict with the keys AAPName + 'C1', 'C2' and 'C3',
            each a fraction rounded to 3 decimals. For an empty sequence
            all three values are 0.0 instead of raising ZeroDivisionError.
    """
    TProteinSequence = StringtoNum(ProteinSequence, AAProperty)
    Num = len(TProteinSequence)
    Result = {}
    for label in ('1', '2', '3'):
        key = AAPName + 'C' + label
        if Num == 0:
            # No residues at all: every class fraction is zero.
            Result[key] = 0.0
        else:
            Result[key] = round(float(TProteinSequence.count(label)) / Num, 3)
    return Result
def CalculateTransition(ProteinSequence, AAProperty, AAPName):
    """
    A method used for computing transition descriptors: for each pair of
    distinct classes, the number of adjacent positions in the encoded
    sequence that switch between the two classes (in either direction),
    divided by the number of adjacent pairs (length - 1).

    Usage:
        result = CalculateTransition(protein, AAProperty, AAPName)

    Input: ProteinSequence is a pure protein sequence.
           AAProperty is a dict containing the classification of amino
           acids, such as _Polarizability.
           AAPName is a string used as the prefix of the result keys.

    Output: result is a dict with the keys AAPName + 'T12', 'T13' and
            'T23', each rounded to 3 decimals. A sequence shorter than two
            residues has no adjacent pairs, so all values are 0.0 (a
            one-residue sequence used to raise ZeroDivisionError).
    """
    CTD = StringtoNum(ProteinSequence, AAProperty)
    Pairs = len(CTD) - 1
    Result = {}
    for first, second in (('1', '2'), ('1', '3'), ('2', '3')):
        key = AAPName + 'T' + first + second
        if Pairs < 1:
            Result[key] = 0.0
            continue
        # A two-character pattern of distinct symbols cannot overlap itself,
        # so str.count reports every occurrence.
        changes = CTD.count(first + second) + CTD.count(second + first)
        Result[key] = round(float(changes) / Pairs, 3)
    return Result
def CalculateDistribution(ProteinSequence, AAProperty, AAPName):
    """
    A method used for computing distribution descriptors: for each class,
    the position (as a percentage of the sequence length) of its first
    residue, of the residues at 25%, 50% and 75% of its occurrences, and
    of its last residue.

    Usage:
        result = CalculateDistribution(protein, AAProperty, AAPName)

    Input: ProteinSequence is a pure protein sequence.
           AAProperty is a dict containing the classification of amino
           acids, such as _Polarizability.
           AAPName is a string used as the prefix of the result keys.

    Output: result is a dict with the keys AAPName + 'D' + class +
            '001' / '025' / '050' / '075' / '100'. Values are rounded to
            3 decimals; a class that does not occur gets 0 for all five.
    """
    TProteinSequence = StringtoNum(ProteinSequence, AAProperty)
    Num = len(TProteinSequence)
    Result = {}
    for label in ('1', '2', '3'):
        prefix = AAPName + 'D' + label
        # 1-based positions of every residue of this class, in sequence order.
        positions = [pos for pos, symbol in enumerate(TProteinSequence, 1) if symbol == label]
        if not positions:
            for suffix in ('001', '025', '050', '075', '100'):
                Result[prefix + suffix] = 0
            continue
        count = len(positions)
        Result[prefix + '001'] = round(float(positions[0]) / Num * 100, 3)
        for suffix, fraction in (('025', 0.25), ('050', 0.5), ('075', 0.75)):
            # Clamp the rank to the first occurrence: with few occurrences
            # floor(count * fraction) is 0, and indexing with 0 - 1 would
            # wrap around to the LAST occurrence.
            rank = max(int(math.floor(count * fraction)), 1)
            Result[prefix + suffix] = round(float(positions[rank - 1]) / Num * 100, 3)
        Result[prefix + '100'] = round(float(positions[-1]) / Num * 100, 3)
    return Result
##################################################################################################
def CalculateCompositionHydrophobicity(ProteinSequence):
    """
    Composition descriptors for the Hydrophobicity classification of AADs.

    Usage:
        result = CalculateCompositionHydrophobicity(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateComposition for the
            _Hydrophobicity table, with keys prefixed by '_Hydrophobicity'.
    """
    return CalculateComposition(ProteinSequence, _Hydrophobicity, '_Hydrophobicity')
def CalculateCompositionNormalizedVDWV(ProteinSequence):
    """
    Composition descriptors for the NormalizedVDWV classification of AADs.

    Usage:
        result = CalculateCompositionNormalizedVDWV(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateComposition for the
            _NormalizedVDWV table, with keys prefixed by '_NormalizedVDWV'.
    """
    return CalculateComposition(ProteinSequence, _NormalizedVDWV, '_NormalizedVDWV')
def CalculateCompositionPolarity(ProteinSequence):
    """
    Composition descriptors for the Polarity classification of AADs.

    Usage:
        result = CalculateCompositionPolarity(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateComposition for the _Polarity
            table, with keys prefixed by '_Polarity'.
    """
    return CalculateComposition(ProteinSequence, _Polarity, '_Polarity')
def CalculateCompositionCharge(ProteinSequence):
    """
    Composition descriptors for the Charge classification of AADs.

    Usage:
        result = CalculateCompositionCharge(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateComposition for the _Charge
            table, with keys prefixed by '_Charge'.
    """
    return CalculateComposition(ProteinSequence, _Charge, '_Charge')
def CalculateCompositionSecondaryStr(ProteinSequence):
    """
    Composition descriptors for the SecondaryStr classification of AADs.

    Usage:
        result = CalculateCompositionSecondaryStr(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateComposition for the
            _SecondaryStr table, with keys prefixed by '_SecondaryStr'.
    """
    return CalculateComposition(ProteinSequence, _SecondaryStr, '_SecondaryStr')
def CalculateCompositionSolventAccessibility(ProteinSequence):
    """
    Composition descriptors for the SolventAccessibility classification
    of AADs.

    Usage:
        result = CalculateCompositionSolventAccessibility(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateComposition for the
            _SolventAccessibility table, with keys prefixed by
            '_SolventAccessibility'.
    """
    return CalculateComposition(ProteinSequence, _SolventAccessibility, '_SolventAccessibility')
def CalculateCompositionPolarizability(ProteinSequence):
    """
    Composition descriptors for the Polarizability classification of AADs.

    Usage:
        result = CalculateCompositionPolarizability(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateComposition for the
            _Polarizability table, with keys prefixed by '_Polarizability'.
    """
    return CalculateComposition(ProteinSequence, _Polarizability, '_Polarizability')
##################################################################################################
##################################################################################################
def CalculateTransitionHydrophobicity(ProteinSequence):
    """
    Transition descriptors for the Hydrophobicity classification of AADs.

    Usage:
        result = CalculateTransitionHydrophobicity(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateTransition for the
            _Hydrophobicity table, with keys prefixed by '_Hydrophobicity'.
    """
    return CalculateTransition(ProteinSequence, _Hydrophobicity, '_Hydrophobicity')
def CalculateTransitionNormalizedVDWV(ProteinSequence):
    """
    Transition descriptors for the NormalizedVDWV classification of AADs.

    Usage:
        result = CalculateTransitionNormalizedVDWV(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateTransition for the
            _NormalizedVDWV table, with keys prefixed by '_NormalizedVDWV'.
    """
    return CalculateTransition(ProteinSequence, _NormalizedVDWV, '_NormalizedVDWV')
def CalculateTransitionPolarity(ProteinSequence):
    """
    Transition descriptors for the Polarity classification of AADs.

    Usage:
        result = CalculateTransitionPolarity(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateTransition for the _Polarity
            table, with keys prefixed by '_Polarity'.
    """
    return CalculateTransition(ProteinSequence, _Polarity, '_Polarity')
def CalculateTransitionCharge(ProteinSequence):
    """
    Transition descriptors for the Charge classification of AADs.

    Usage:
        result = CalculateTransitionCharge(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateTransition for the _Charge
            table, with keys prefixed by '_Charge'.
    """
    return CalculateTransition(ProteinSequence, _Charge, '_Charge')
def CalculateTransitionSecondaryStr(ProteinSequence):
    """
    Transition descriptors for the SecondaryStr classification of AADs.

    Usage:
        result = CalculateTransitionSecondaryStr(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateTransition for the
            _SecondaryStr table, with keys prefixed by '_SecondaryStr'.
    """
    return CalculateTransition(ProteinSequence, _SecondaryStr, '_SecondaryStr')
def CalculateTransitionSolventAccessibility(ProteinSequence):
    """
    Transition descriptors for the SolventAccessibility classification
    of AADs.

    Usage:
        result = CalculateTransitionSolventAccessibility(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateTransition for the
            _SolventAccessibility table, with keys prefixed by
            '_SolventAccessibility'.
    """
    return CalculateTransition(ProteinSequence, _SolventAccessibility, '_SolventAccessibility')
def CalculateTransitionPolarizability(ProteinSequence):
    """
    Transition descriptors for the Polarizability classification of AADs.

    Usage:
        result = CalculateTransitionPolarizability(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateTransition for the
            _Polarizability table, with keys prefixed by '_Polarizability'.
    """
    return CalculateTransition(ProteinSequence, _Polarizability, '_Polarizability')
##################################################################################################
##################################################################################################
def CalculateDistributionHydrophobicity(ProteinSequence):
    """
    Distribution descriptors for the Hydrophobicity classification of AADs.

    Usage:
        result = CalculateDistributionHydrophobicity(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateDistribution for the
            _Hydrophobicity table, with keys prefixed by '_Hydrophobicity'.
    """
    return CalculateDistribution(ProteinSequence, _Hydrophobicity, '_Hydrophobicity')
def CalculateDistributionNormalizedVDWV(ProteinSequence):
    """
    Distribution descriptors for the NormalizedVDWV classification of AADs.

    Usage:
        result = CalculateDistributionNormalizedVDWV(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateDistribution for the
            _NormalizedVDWV table, with keys prefixed by '_NormalizedVDWV'.
    """
    return CalculateDistribution(ProteinSequence, _NormalizedVDWV, '_NormalizedVDWV')
def CalculateDistributionPolarity(ProteinSequence):
    """
    Distribution descriptors for the Polarity classification of AADs.

    Usage:
        result = CalculateDistributionPolarity(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateDistribution for the _Polarity
            table, with keys prefixed by '_Polarity'.
    """
    return CalculateDistribution(ProteinSequence, _Polarity, '_Polarity')
def CalculateDistributionCharge(ProteinSequence):
    """
    Distribution descriptors for the Charge classification of AADs.

    Usage:
        result = CalculateDistributionCharge(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateDistribution for the _Charge
            table, with keys prefixed by '_Charge'.
    """
    return CalculateDistribution(ProteinSequence, _Charge, '_Charge')
def CalculateDistributionSecondaryStr(ProteinSequence):
    """
    Distribution descriptors for the SecondaryStr classification of AADs.

    Usage:
        result = CalculateDistributionSecondaryStr(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateDistribution for the
            _SecondaryStr table, with keys prefixed by '_SecondaryStr'.
    """
    return CalculateDistribution(ProteinSequence, _SecondaryStr, '_SecondaryStr')
def CalculateDistributionSolventAccessibility(ProteinSequence):
    """
    Distribution descriptors for the SolventAccessibility classification
    of AADs.

    Usage:
        result = CalculateDistributionSolventAccessibility(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateDistribution for the
            _SolventAccessibility table, with keys prefixed by
            '_SolventAccessibility'.
    """
    return CalculateDistribution(ProteinSequence, _SolventAccessibility, '_SolventAccessibility')
def CalculateDistributionPolarizability(ProteinSequence):
    """
    Distribution descriptors for the Polarizability classification of AADs.

    Usage:
        result = CalculateDistributionPolarizability(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: the dict produced by CalculateDistribution for the
            _Polarizability table, with keys prefixed by '_Polarizability'.
    """
    return CalculateDistribution(ProteinSequence, _Polarizability, '_Polarizability')
##################################################################################################
def CalculateC(ProteinSequence):
    """
    Collect the composition descriptors of all seven AAD properties.

    Usage:
        result = CalculateC(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: result is one dict merging the outputs of the seven
            CalculateComposition* functions.
    """
    # Same call order as before, so the merged dict is filled identically.
    calculators = (
        CalculateCompositionPolarizability,
        CalculateCompositionSolventAccessibility,
        CalculateCompositionSecondaryStr,
        CalculateCompositionCharge,
        CalculateCompositionPolarity,
        CalculateCompositionNormalizedVDWV,
        CalculateCompositionHydrophobicity,
    )
    merged = {}
    for calculate in calculators:
        merged.update(calculate(ProteinSequence))
    return merged
def CalculateT(ProteinSequence):
    """
    Collect the transition descriptors of all seven AAD properties.

    Usage:
        result = CalculateT(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: result is one dict merging the outputs of the seven
            CalculateTransition* functions.
    """
    # Same call order as before, so the merged dict is filled identically.
    calculators = (
        CalculateTransitionPolarizability,
        CalculateTransitionSolventAccessibility,
        CalculateTransitionSecondaryStr,
        CalculateTransitionCharge,
        CalculateTransitionPolarity,
        CalculateTransitionNormalizedVDWV,
        CalculateTransitionHydrophobicity,
    )
    merged = {}
    for calculate in calculators:
        merged.update(calculate(ProteinSequence))
    return merged
def CalculateD(ProteinSequence):
    """
    Collect the distribution descriptors of all seven AAD properties.

    Usage:
        result = CalculateD(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: result is one dict merging the outputs of the seven
            CalculateDistribution* functions.
    """
    # Same call order as before, so the merged dict is filled identically.
    calculators = (
        CalculateDistributionPolarizability,
        CalculateDistributionSolventAccessibility,
        CalculateDistributionSecondaryStr,
        CalculateDistributionCharge,
        CalculateDistributionPolarity,
        CalculateDistributionNormalizedVDWV,
        CalculateDistributionHydrophobicity,
    )
    merged = {}
    for calculate in calculators:
        merged.update(calculate(ProteinSequence))
    return merged
def CalculateCTD(ProteinSequence):
    """
    Collect every CTD descriptor of all seven AAD properties.

    Usage:
        result = CalculateCTD(protein)

    Input: ProteinSequence is a pure protein sequence.

    Output: result is one dict holding all composition, transition and
            distribution descriptors.
    """
    # CalculateC, CalculateT and CalculateD each run the seven per-property
    # calculators in the order this function used to spell out, so chaining
    # them fills the dict in the same sequence.
    merged = {}
    for aggregate in (CalculateC, CalculateT, CalculateD):
        merged.update(aggregate(ProteinSequence))
    return merged
##################################################################################################
if __name__ == "__main__":
    # Demo: print every CTD descriptor of a sample sequence.
    protein = "ADGCGVGEGTGQGPMCNCMCMKWVYADEDAADLESDSFADEDASLESDSFPWSNQRVFCSFADEDAS"
    # The parenthesised single-argument form is valid both as a Python 2
    # print statement and as a Python 3 print() call.
    print(CalculateCTD(protein))