Source code for PyDNApsenac

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
##############################################################################

A module used for computing different types of DNA descriptors
(PseDNC, PseKNC, PC-PseDNC, PC-PseTNC, SC-PseDNC and SC-PseTNC).

You can freely use and distribute it. If you have any problems,

please feel free to contact us.

Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.06.14

Email: gadsby@163.com and oriental-cds@163.com

##############################################################################
"""

from PyDNAutil import GetData
from PyDNApsenacutil import ExtendPhycheIndex


def CheckPsenac(lamada, w, k):
    """Validate the parameters lamada, w and k.

    The checks run in the order lamada, w, k and the first violation found
    is the one reported.

    :param lamada: must be an int strictly larger than 0.
    :param w: must lie in the closed interval [0, 1].
    :param k: must be an int strictly larger than 0.
    :raises ValueError: if any of the parameters is outside its allowed range.
    """
    if not isinstance(lamada, int) or lamada <= 0:
        # The message used to say "larger than and equal to 0" although 0 is
        # rejected by the condition above; it now matches the actual check.
        raise ValueError("Error, parameter lamada must be an int type and larger than 0.")
    if w > 1 or w < 0:
        raise ValueError("Error, parameter w must be ranged from 0 to 1.")
    if not isinstance(k, int) or k <= 0:
        raise ValueError("Error, parameter k must be an int type and larger than 0.")
def GetSequenceListAndPhycheValuePsednc(input_data, extra_phyche_index=None):
    """Build the sequence list and the dinucleotide property table for PseDNC.

    :param input_data: file type or handle, forwarded unchanged to GetData.
    :param extra_phyche_index: dict, the key is the dinucleotide (string),
        the value is its physicochemical property value (list). It holds the
        user-defined physicochemical indices; None is treated as an empty dict.
    :return: tuple ``(sequence_list, phyche_value)`` where ``sequence_list``
        is the result of GetData and ``phyche_value`` is the result of
        ExtendPhycheIndex applied to the built-in table and the user indices.
    """
    user_defined = {} if extra_phyche_index is None else extra_phyche_index

    # Built-in table: six property values for each of the 16 dinucleotides.
    builtin_table = {
        'AA': [0.06, 0.5, 0.27, 1.59, 0.11, -0.11],
        'AC': [1.50, 0.50, 0.80, 0.13, 1.29, 1.04],
        'AG': [0.78, 0.36, 0.09, 0.68, -0.24, -0.62],
        'AT': [1.07, 0.22, 0.62, -1.02, 2.51, 1.17],
        'CA': [-1.38, -1.36, -0.27, -0.86, -0.62, -1.25],
        'CC': [0.06, 1.08, 0.09, 0.56, -0.82, 0.24],
        'CG': [-1.66, -1.22, -0.44, -0.82, -0.29, -1.39],
        'CT': [0.78, 0.36, 0.09, 0.68, -0.24, -0.62],
        'GA': [-0.08, 0.5, 0.27, 0.13, -0.39, 0.71],
        'GC': [-0.08, 0.22, 1.33, -0.35, 0.65, 1.59],
        'GG': [0.06, 1.08, 0.09, 0.56, -0.82, 0.24],
        'GT': [1.50, 0.50, 0.80, 0.13, 1.29, 1.04],
        'TA': [-1.23, -2.37, -0.44, -2.24, -1.51, -1.39],
        'TC': [-0.08, 0.5, 0.27, 0.13, -0.39, 0.71],
        'TG': [-1.38, -1.36, -0.27, -0.86, -0.62, -1.25],
        'TT': [0.06, 0.5, 0.27, 1.59, 0.11, -0.11],
    }

    # Read the sequences first, then merge the tables (same order as before).
    sequences = GetData(input_data)
    merged_table = ExtendPhycheIndex(builtin_table, user_defined)
    return sequences, merged_table
def GetSequenceListAndPhycheValuePseknc(input_data, extra_phyche_index=None):
    """Build the sequence list and the dinucleotide property table for PseKNC.

    :param input_data: file type or handle, forwarded unchanged to GetData.
    :param extra_phyche_index: dict, the key is the dinucleotide (string),
        the value is its physicochemical property value (list). It holds the
        user-defined physicochemical indices; None is treated as an empty dict.
    :return: tuple ``(sequence_list, phyche_value)`` where ``sequence_list``
        is the result of GetData and ``phyche_value`` is the result of
        ExtendPhycheIndex applied to the built-in table and the user indices.
    """
    user_defined = {} if extra_phyche_index is None else extra_phyche_index

    # Built-in table: six property values for each of the 16 dinucleotides.
    builtin_table = {
        'AA': [0.06, 0.5, 0.09, 1.59, 0.11, -0.11],
        'AC': [1.5, 0.5, 1.19, 0.13, 1.29, 1.04],
        'GT': [1.5, 0.5, 1.19, 0.13, 1.29, 1.04],
        'AG': [0.78, 0.36, -0.28, 0.68, -0.24, -0.62],
        'CC': [0.06, 1.08, -0.28, 0.56, -0.82, 0.24],
        'CA': [-1.38, -1.36, -1.01, -0.86, -0.62, -1.25],
        'CG': [-1.66, -1.22, -1.38, -0.82, -0.29, -1.39],
        'TT': [0.06, 0.5, 0.09, 1.59, 0.11, -0.11],
        'GG': [0.06, 1.08, -0.28, 0.56, -0.82, 0.24],
        'GC': [-0.08, 0.22, 2.3, -0.35, 0.65, 1.59],
        'AT': [1.07, 0.22, 0.83, -1.02, 2.51, 1.17],
        'GA': [-0.08, 0.5, 0.09, 0.13, -0.39, 0.71],
        'TG': [-1.38, -1.36, -1.01, -0.86, -0.62, -1.25],
        'TA': [-1.23, -2.37, -1.38, -2.24, -1.51, -1.39],
        'TC': [-0.08, 0.5, 0.09, 0.13, -0.39, 0.71],
        'CT': [0.78, 0.36, -0.28, 0.68, -0.24, -0.62],
    }

    # Read the sequences first, then merge the tables (same order as before).
    sequences = GetData(input_data)
    merged_table = ExtendPhycheIndex(builtin_table, user_defined)
    return sequences, merged_table
def GetSequenceListAndPhycheValue(input_data, k, phyche_index, extra_phyche_index, all_property):
    """For PseKNC-general make sequence_list and phyche_value.

    :param input_data: file type or handle, forwarded unchanged to GetData.
    :param k: int, the value of k-tuple. Built-in property names are only
        available for k == 2 (dinucleotide) and k == 3 (trinucleotide); for
        any other k the list of known names is empty.
    :param phyche_index: physicochemical properties list (names); None is
        treated as an empty list. Ignored when all_property is True.
    :param extra_phyche_index: dict, the key is the k-tuple nucleotide
        (string), the value is its physicochemical property value (list).
        It means the user-defined physicochemical indices; None is treated
        as an empty dict.
    :param all_property: bool, choose all physicochemical properties or not.
        Only the literal True selects every built-in property for k.
    :return: tuple ``(sequence_list, phyche_value)``.
    :raises NameError: if all_property is not True and phyche_index contains
        a name that is not a built-in property for the given k.
    """
    if phyche_index is None:
        phyche_index = []
    if extra_phyche_index is None:
        extra_phyche_index = {}

    # NOTE(review): 'Duplex tability(disruptenergy)' looks misspelled but is
    # kept verbatim; presumably it has to match the key used by the data that
    # GetPhycheIndex reads - confirm before renaming.
    diphyche_list = ['Base stacking', 'Protein induced deformability', 'B-DNA twist', 'Dinucleotide GC Content',
                     'A-philicity', 'Propeller twist', 'Duplex stability:(freeenergy)',
                     'Duplex tability(disruptenergy)', 'DNA denaturation', 'Bending stiffness', 'Protein DNA twist',
                     'Stabilising energy of Z-DNA', 'Aida_BA_transition', 'Breslauer_dG', 'Breslauer_dH',
                     'Breslauer_dS', 'Electron_interaction', 'Hartman_trans_free_energy', 'Helix-Coil_transition',
                     'Ivanov_BA_transition', 'Lisser_BZ_transition', 'Polar_interaction', 'SantaLucia_dG',
                     'SantaLucia_dH', 'SantaLucia_dS', 'Sarai_flexibility', 'Stability', 'Stacking_energy',
                     'Sugimoto_dG', 'Sugimoto_dH', 'Sugimoto_dS', 'Watson-Crick_interaction', 'Twist', 'Tilt',
                     'Roll', 'Shift', 'Slide', 'Rise']
    triphyche_list = ['Dnase I', 'Bendability (DNAse)', 'Bendability (consensus)', 'Trinucleotide GC Content',
                      'Nucleosome positioning', 'Consensus_roll', 'Consensus-Rigid', 'Dnase I-Rigid', 'MW-Daltons',
                      'MW-kg', 'Nucleosome', 'Nucleosome-Rigid']

    # Set and check physicochemical properties.
    if k == 2:
        phyche_list = diphyche_list
    elif k == 3:
        phyche_list = triphyche_list
    else:
        phyche_list = []

    if all_property is True:
        phyche_index = phyche_list
    else:
        for e in phyche_index:
            if e not in phyche_list:
                # %s formatting so that a non-string entry is still reported
                # as NameError instead of failing inside the concatenation.
                raise NameError('Sorry, the physicochemical property %s does not exist.' % (e,))

    # Generate phyche_value and sequence_list.
    from PyDNApsenacutil import GetPhycheIndex
    phyche_value = ExtendPhycheIndex(GetPhycheIndex(k, phyche_index), extra_phyche_index)
    sequence_list = GetData(input_data)
    return sequence_list, phyche_value
def GetPseDNC(input_data, **kwargs):
    """Make PseDNC dictionary.

    :param input_data: a single input; it is wrapped in a one-element list
        before being handed to GetSequenceListAndPhycheValuePsednc.
    :param lamada: keyword, forwarded to MakePsekncVector (default 3).
    :param w: keyword, forwarded to MakePsekncVector (default 0.05).
    :param k: keyword, k-tuple (default 2).
    :param extra_phyche_index: keyword, dict, the key is the dinucleotide
        (string), the value is its physicochemical property value (list).
        It means the user-defined physicochemical indices (default None).
    :return: dict mapping 'PseDNC_1', 'PseDNC_2', ... to the entries of the
        first vector returned by MakePsekncVector.
    """
    lamada = kwargs.get('lamada', 3)
    w = kwargs.get('w', 0.05)
    k = kwargs.get('k', 2)
    # Previously this value was assigned to ``kwargs`` by mistake, which left
    # ``extra_phyche_index`` unbound whenever the caller supplied it.
    extra_phyche_index = kwargs.get('extra_phyche_index')

    input_data = [input_data]
    sequence_list, phyche_value = GetSequenceListAndPhycheValuePsednc(input_data, extra_phyche_index)

    from PyDNApsenacutil import MakePsekncVector
    vector = MakePsekncVector(sequence_list, lamada, w, k, phyche_value, theta_type=1)

    # Only one input was supplied, so only the first vector is reported.
    dict_keys = ['PseDNC_%s' % i for i in range(1, len(vector[0]) + 1)]
    res = dict(zip(dict_keys, vector[0]))
    return res
def GetPseKNC(input_data, **kwargs):
    """Make PseKNC dictionary.

    :param input_data: a single input; it is wrapped in a one-element list
        before being handed to GetSequenceListAndPhycheValuePseknc.
    :param lamada: keyword, forwarded to MakeOldPsekncVector (default 1).
    :param w: keyword, forwarded to MakeOldPsekncVector (default 0.5).
    :param k: keyword, k-tuple (default 3).
    :param extra_phyche_index: keyword, dict, the key is the dinucleotide
        (string), the value is its physicochemical property value (list).
        It means the user-defined physicochemical indices (default None).
    :return: dict mapping 'PseKNC_1', 'PseKNC_2', ... to the entries of the
        first vector returned by MakeOldPsekncVector.
    """
    lamada = kwargs.get('lamada', 1)
    w = kwargs.get('w', 0.5)
    k = kwargs.get('k', 3)
    # Previously this value was assigned to ``kwargs`` by mistake, which left
    # ``extra_phyche_index`` unbound whenever the caller supplied it.
    extra_phyche_index = kwargs.get('extra_phyche_index')

    input_data = [input_data]
    sequence_list, phyche_value = GetSequenceListAndPhycheValuePseknc(input_data, extra_phyche_index)

    from PyDNApsenacutil import MakeOldPsekncVector
    vector = MakeOldPsekncVector(sequence_list, lamada, w, k, phyche_value, theta_type=1)

    # Only one input was supplied, so only the first vector is reported.
    dict_keys = ['PseKNC_%s' % i for i in range(1, len(vector[0]) + 1)]
    res = dict(zip(dict_keys, vector[0]))
    return res
def GetPCPseDNC(input_data, **kwargs):
    """Make a PCPseDNC dictionary.

    :param input_data: a single input; it is wrapped in a one-element list
        before being handed to GetSequenceListAndPhycheValue.
    :param lamada: keyword, forwarded to MakePsekncVector (default 1).
    :param w: keyword, forwarded to MakePsekncVector (default 0.05).
    :param k: keyword, k-tuple (default 2).
    :param phyche_index: keyword, physicochemical properties list
        (default None).
    :param all_property: keyword, choose all physicochemical properties or
        not (default False).
    :param extra_phyche_index: keyword, dict, the key is the dinucleotide
        (string), the value is its physicochemical property value (list).
        It means the user-defined physicochemical indices (default None).
    :return: dict mapping 'PCPseDNC_1', 'PCPseDNC_2', ... to the entries of
        the first vector returned by MakePsekncVector (theta_type=1).
    """
    options = kwargs
    lam = options.get('lamada', 1)
    weight = options.get('w', 0.05)
    tuple_size = options.get('k', 2)
    selected = options.get('phyche_index', None)
    use_all = options.get('all_property', False)
    user_defined = options.get('extra_phyche_index', None)

    # Make vector.
    sequences, property_table = GetSequenceListAndPhycheValue(
        [input_data], tuple_size, selected, user_defined, use_all)

    from PyDNApsenacutil import MakePsekncVector
    vectors = MakePsekncVector(sequences, lam, weight, tuple_size, property_table, theta_type=1)

    # Number the components of the first (only) vector starting at 1.
    first = vectors[0]
    labels = ['PCPseDNC_%s' % pos for pos in range(1, len(first) + 1)]
    return dict(zip(labels, first))
def GetPCPseTNC(input_data, **kwargs):
    """Make a PCPseTNC dictionary.

    :param input_data: a single input; it is wrapped in a one-element list
        before being handed to GetSequenceListAndPhycheValue.
    :param lamada: keyword, forwarded to MakePsekncVector (default 1).
    :param w: keyword, forwarded to MakePsekncVector (default 0.05).
    :param k: keyword, k-tuple (default 3).
    :param phyche_index: keyword, physicochemical properties list
        (default None).
    :param all_property: keyword, choose all physicochemical properties or
        not (default False).
    :param extra_phyche_index: keyword, dict, the key is the nucleotide
        k-tuple (string), the value is its physicochemical property value
        (list). It means the user-defined physicochemical indices
        (default None).
    :return: dict mapping 'PCPseTNC_1', 'PCPseTNC_2', ... to the entries of
        the first vector returned by MakePsekncVector (theta_type=1).
    """
    options = kwargs
    lam = options.get('lamada', 1)
    weight = options.get('w', 0.05)
    tuple_size = options.get('k', 3)
    selected = options.get('phyche_index', None)
    use_all = options.get('all_property', False)
    user_defined = options.get('extra_phyche_index', None)

    sequences, property_table = GetSequenceListAndPhycheValue(
        [input_data], tuple_size, selected, user_defined, use_all)

    # Make vector.
    from PyDNApsenacutil import MakePsekncVector
    vectors = MakePsekncVector(sequences, lam, weight, tuple_size, property_table, theta_type=1)

    # Number the components of the first (only) vector starting at 1.
    first = vectors[0]
    labels = ['PCPseTNC_%s' % pos for pos in range(1, len(first) + 1)]
    return dict(zip(labels, first))
def GetSCPseDNC(input_data, **kwargs):
    """Make a SCPseDNC dictionary.

    :param input_data: a single input; it is wrapped in a one-element list
        before being handed to GetSequenceListAndPhycheValue.
    :param lamada: keyword, forwarded to MakePsekncVector (default 1).
    :param w: keyword, forwarded to MakePsekncVector (default 0.05).
    :param k: keyword, k-tuple (default 2).
    :param phyche_index: keyword, physicochemical properties list
        (default None).
    :param all_property: keyword, choose all physicochemical properties or
        not (default False).
    :param extra_phyche_index: keyword, dict, the key is the dinucleotide
        (string), the value is its physicochemical property value (list).
        It means the user-defined physicochemical indices (default None).
    :return: dict mapping 'SCPseDNC_1', 'SCPseDNC_2', ... to the entries of
        the first vector returned by MakePsekncVector (theta_type=2).
    """
    options = kwargs
    lam = options.get('lamada', 1)
    weight = options.get('w', 0.05)
    tuple_size = options.get('k', 2)
    selected = options.get('phyche_index', None)
    use_all = options.get('all_property', False)
    user_defined = options.get('extra_phyche_index', None)

    sequences, property_table = GetSequenceListAndPhycheValue(
        [input_data], tuple_size, selected, user_defined, use_all)

    # Make vector.
    from PyDNApsenacutil import MakePsekncVector
    vectors = MakePsekncVector(sequences, lam, weight, tuple_size, property_table, theta_type=2)

    # Number the components of the first (only) vector starting at 1.
    first = vectors[0]
    labels = ['SCPseDNC_%s' % pos for pos in range(1, len(first) + 1)]
    return dict(zip(labels, first))
def GetSCPseTNC(input_data, **kwargs):
    """Make a SCPseTNC dictionary.

    :param input_data: a single input; it is wrapped in a one-element list
        before being handed to GetSequenceListAndPhycheValue.
    :param lamada: keyword, forwarded to MakePsekncVector (default 1).
    :param w: keyword, forwarded to MakePsekncVector (default 0.05).
    :param k: keyword, k-tuple (default 3).
    :param phyche_index: keyword, physicochemical properties list
        (default None).
    :param all_property: keyword, choose all physicochemical properties or
        not (default False).
    :param extra_phyche_index: keyword, dict, the key is the nucleotide
        k-tuple (string), the value is its physicochemical property value
        (list). It means the user-defined physicochemical indices
        (default None).
    :return: dict mapping 'SCPseTNC_1', 'SCPseTNC_2', ... to the entries of
        the first vector returned by MakePsekncVector (theta_type=2).
    """
    options = kwargs
    lam = options.get('lamada', 1)
    weight = options.get('w', 0.05)
    tuple_size = options.get('k', 3)
    selected = options.get('phyche_index', None)
    use_all = options.get('all_property', False)
    user_defined = options.get('extra_phyche_index', None)

    sequences, property_table = GetSequenceListAndPhycheValue(
        [input_data], tuple_size, selected, user_defined, use_all)

    # Make vector.
    from PyDNApsenacutil import MakePsekncVector
    vectors = MakePsekncVector(sequences, lam, weight, tuple_size, property_table, theta_type=2)

    # Number the components of the first (only) vector starting at 1.
    first = vectors[0]
    labels = ['SCPseTNC_%s' % pos for pos in range(1, len(first) + 1)]
    return dict(zip(labels, first))
if __name__ == '__main__':
    # Demo script: runs every descriptor function on sample sequences and
    # prints the resulting dictionaries (and, for the long sequence, their
    # sizes). The unused ``time`` import / ``start_time`` and a dead
    # ``phyche_index`` assignment were removed; the printed output and its
    # order are unchanged.

    # --- Short-sequence examples -------------------------------------------
    print(GetPseDNC('ACCCCA', lamada=2, w=0.05))
    print(GetPCPseDNC('ACCCCA', phyche_index=["Tilt", 'Twist', 'Rise', 'Roll', 'Shift', 'Slide'],
                      lamada=2, w=0.05))
    print(GetPCPseTNC('ACCCCA', phyche_index=['Dnase I', 'Nucleosome'], lamada=2, w=0.05))
    print(GetSCPseDNC('ACCCCCA', phyche_index=['Twist', 'Tilt'], lamada=2, w=0.05))
    print(GetSCPseTNC('ACCCCCA', phyche_index=['Dnase I', 'Nucleosome'], lamada=1, w=0.05))
    print(GetSCPseTNC('ACCCCA', phyche_index=["Dnase I", 'Nucleosome'], lamada=2, w=0.05))

    # Imported once here; the original repeated this import before each use.
    from PyDNAutil import NormalizeIndex

    long_seq = 'GACTGAACTGCACTTTGGTTTCATATTATTTGCTC'

    # User-defined raw property rows. Stored as tuples and copied into fresh
    # lists for every call, so each NormalizeIndex call receives its own list
    # exactly as in the original script.
    # 16 values - presumably one per dinucleotide; confirm against NormalizeIndex.
    di_extra_row = (1.019, -0.918, 0.488, 0.567, 0.567, -0.070, -0.579, 0.488,
                    -0.654, -2.455, -0.070, -0.918, 1.603, -0.654, 0.567, 1.019)
    # 64 values - presumably one per trinucleotide; confirm against NormalizeIndex.
    tri_extra_row = (7.176, 6.272, 4.736, 7.237, 3.810, 4.156, 4.156, 6.033,
                     3.410, 3.524, 4.445, 6.033, 1.613, 5.087, 2.169, 7.237,
                     3.581, 3.239, 1.668, 2.169, 6.813, 3.868, 5.440, 4.445,
                     3.810, 4.678, 5.440, 4.156, 2.673, 3.353, 1.668, 4.736,
                     4.214, 3.925, 3.353, 5.087, 2.842, 2.448, 4.678, 3.524,
                     3.581, 2.448, 3.868, 4.156, 3.467, 3.925, 3.239, 6.272,
                     2.955, 3.467, 2.673, 1.613, 1.447, 3.581, 3.810, 3.410,
                     1.447, 2.842, 6.813, 3.810, 2.955, 4.214, 3.581, 7.176)

    def show(result):
        """Print a descriptor dictionary followed by its number of entries."""
        print(result)
        print(len(result))

    # --- PseDNC / PseKNC ----------------------------------------------------
    print('Begin PseDNC')
    show(GetPseDNC(long_seq))
    show(GetPseKNC(long_seq))

    # --- PC-PseDNC / PC-PseTNC ---------------------------------------------
    print('PC-PseDNC')
    show(GetPCPseDNC(long_seq, phyche_index=['Twist', 'Tilt']))
    # k=2 is passed explicitly here, overriding the function's default k.
    show(GetPCPseTNC(long_seq, lamada=1, w=0.05, k=2, phyche_index=['Twist', 'Tilt']))
    show(GetPCPseTNC(long_seq, phyche_index=['Dnase I', 'Nucleosome'],
                     extra_phyche_index=NormalizeIndex([list(tri_extra_row)], is_convert_dict=True)))

    # --- SC-PseDNC ----------------------------------------------------------
    print('SC-PseDNC')
    show(GetSCPseDNC(long_seq, phyche_index=['Twist', 'Tilt']))
    show(GetSCPseDNC(long_seq, all_property=True, lamada=2, w=0.05))
    show(GetSCPseDNC(long_seq, phyche_index=['Twist', 'Tilt'],
                     extra_phyche_index=NormalizeIndex([list(di_extra_row)], is_convert_dict=True)))
    print()

    # --- SC-PseTNC ----------------------------------------------------------
    print('SC-PseTNC')
    show(GetSCPseTNC(long_seq, phyche_index=['Dnase I', 'Nucleosome']))
    show(GetSCPseTNC(long_seq, all_property=True, lamada=2, w=0.05))
    show(GetSCPseTNC(long_seq, phyche_index=['Dnase I', 'Nucleosome'],
                     extra_phyche_index=NormalizeIndex([list(tri_extra_row)], is_convert_dict=True)))

    # Normalize PseDNC index Twist, Tilt, Roll, Shift, Slide, Rise.
    original_phyche_value = [
        [0.026, 0.036, 0.031, 0.033, 0.016, 0.026, 0.014, 0.031, 0.025, 0.025, 0.026, 0.036, 0.017, 0.025, 0.016, 0.026],
        [0.038, 0.038, 0.037, 0.036, 0.025, 0.042, 0.026, 0.037, 0.038, 0.036, 0.042, 0.038, 0.018, 0.038, 0.025, 0.038],
        [0.020, 0.023, 0.019, 0.022, 0.017, 0.019, 0.016, 0.019, 0.020, 0.026, 0.019, 0.023, 0.016, 0.020, 0.017, 0.020],
        [1.69, 1.32, 1.46, 1.03, 1.07, 1.43, 1.08, 1.46, 1.32, 1.20, 1.43, 1.32, 0.72, 1.32, 1.07, 1.69],
        [2.26, 3.03, 2.03, 3.83, 1.78, 1.65, 2.00, 2.03, 1.93, 2.61, 1.65, 3.03, 1.20, 1.93, 1.78, 2.26],
        [7.65, 8.93, 7.08, 9.07, 6.38, 8.04, 6.23, 7.08, 8.56, 9.53, 8.04, 8.93, 6.23, 8.56, 6.38, 7.65]]
    for e in NormalizeIndex(original_phyche_value, is_convert_dict=True).items():
        print(e)