# Source code for ConjointTriad

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
###############################################################################
This module is used for calculating the conjoint triad features only from the 

protein sequence information. You can get 7*7*7=343 features. You can freely 

use and distribute it. If you have any problems, please contact us.

Reference:

Juwen Shen, Jian Zhang, Xiaomin Luo, Weiliang Zhu, Kunqian Yu, Kaixian Chen, 

Yixue Li, Huanliang Jiang. Predicting protein-protein interactions based only 

on sequences information. PNAS. 2007 (104) 4337-4341.

Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.06.04

Email: gadsby@163.com

###############################################################################
"""

import string

###############################################################################
# One-letter codes of the 20 amino acids, in the order used by this module.
AALetter = list("ARNDCEQGHILKMFPSTWYV")

# a Dipole scale (Debye): -, Dipole<1.0; +, 1.0<Dipole<2.0; ++, 2.0<Dipole<3.0; +++, Dipole>3.0; +'+'+', Dipole>3.0 with opposite orientation.
# b Volume scale (Å^3): -, Volume<50; +, Volume>50.
# c Cys is separated from class 3 because of its ability to form disulfide bonds.

# Class number -> one-letter codes of the amino acids assigned to that class.
_repmat = {
    1: list("AGV"),
    2: list("ILFP"),
    3: list("YMTS"),
    4: list("HNQW"),
    5: list("RK"),
    6: list("DE"),
    7: list("C"),
}


###############################################################################

def _Str2Num(proteinsequence):
    """
    Encode a protein sequence as a string of class digits.

    Every amino acid letter listed in ``_repmat`` is replaced by the number
    of the class it belongs to. Characters that are not listed in
    ``_repmat`` are left unchanged.

    Input: proteinsequence is a protein sequence string.

    Output: the sequence with each listed letter replaced by its class digit.
    """
    encoded = proteinsequence
    for class_id, letters in _repmat.items():
        digit = str(class_id)
        # Replacements are independent of each other: the inserted digits
        # are never themselves letters that a later pass would replace.
        for letter in letters:
            encoded = encoded.replace(letter, digit)
    return encoded


###############################################################################
def CalculateConjointTriad(proteinsequence):
    """
    Calculate the conjoint triad features from protein sequence.

    The sequence is first translated by _Str2Num into a string of class
    digits. That string is then scanned with a sliding window of three
    characters, moving one position at a time, and each window is counted
    under its triad key.

    Usage:

    res = CalculateConjointTriad(protein)

    Input: protein is a pure protein sequence.

    Output is a dict form containing all 343 (7*7*7) conjoint triad
    features. Keys are the three-digit strings "111" ... "777" and values
    are the number of times that triad occurs in the sequence, overlapping
    occurrences included. Triads that never occur map to 0. A sequence
    shorter than three residues yields all zeros.
    """
    proteinnum = _Str2Num(proteinsequence)

    # Classes are numbered 1..7; there is no class 0, so only digits 1-7
    # take part in the 7*7*7 = 343 feature keys.
    digits = [str(c) for c in range(1, 8)]
    res = {}
    for i in digits:
        for j in digits:
            for k in digits:
                res[i + j + k] = 0

    # Slide one position at a time so overlapping triads are all counted;
    # str.count would only report non-overlapping matches ("111" in "1111"
    # would be 1 instead of 2).
    for start in range(len(proteinnum) - 2):
        triad = proteinnum[start:start + 3]
        # Windows containing anything other than three class digits
        # (e.g. an untranslated character) are not features and are skipped.
        if triad in res:
            res[triad] += 1
    return res


###############################################################################
if __name__ == "__main__":
    # Demo run: print the conjoint triad features of a sample sequence.
    protein = "ADGCGVGEGTGQGPMCNCMCMKWVYADEDAADLESDSFADEDASLESDSFPWSNQRVFCSFADEDAS"
    # Parenthesised single-argument print behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    print(CalculateConjointTriad(protein))