Source code for PyDNApsenacutil

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
##############################################################################

A class used for computing different types of DNA descriptors! 

You can freely use and distribute it. If you have any problem, 

you could contact with us timely.

Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.06.14

Email: gadsby@163.com and oriental-cds@163.com

##############################################################################
"""

import sys
import os
import pickle
from math import pow

from PyDNAutil import Frequency
from PyDNAnacutil import MakeKmerList


ALPHABET = 'ACGT'


[docs]def ExtendPhycheIndex(original_index, extend_index): """Extend {phyche:[value, ... ]}""" if extend_index is None or len(extend_index) == 0: return original_index for key in list(original_index.keys()): original_index[key].extend(extend_index[key]) return original_index
global phyche_factor_dic1 global phyche_factor_dic2 full_path = os.path.realpath(__file__) with open("%s/mmc3.data"%os.path.dirname(full_path), 'rb') as f: phyche_factor_dic1 = pickle.load(f) with open("%s/mmc4.data"%os.path.dirname(full_path), 'rb') as f: phyche_factor_dic2 = pickle.load(f)
[docs]def GetPhycheFactorDic(k): """Get all {nucleotide: [(phyche, value), ...]} dict.""" global phyche_factor_dic1 global phyche_factor_dic2 if 2 == k: phyche_factor_dic = phyche_factor_dic1 elif 3 == k: phyche_factor_dic = phyche_factor_dic2 else: sys.stderr.write("The k can just be 2 or 3.") sys.exit(0) return phyche_factor_dic
[docs]def GetPhycheIndex(k, phyche_list): """get phyche_value according phyche_list.""" phyche_value = {} if 0 == len(phyche_list): for nucleotide in MakeKmerList(k, ALPHABET): phyche_value[nucleotide] = [] return phyche_value nucleotide_phyche_value = GetPhycheFactorDic(k) for nucleotide in MakeKmerList(k, ALPHABET): if nucleotide not in phyche_value: phyche_value[nucleotide] = [] for e in nucleotide_phyche_value[nucleotide]: if e[0] in phyche_list: phyche_value[nucleotide].append(e[1]) return phyche_value
[docs]def ParallelCorFunction(nucleotide1, nucleotide2, phyche_index): """Get the cFactor.(Type1)""" temp_sum = 0.0 phyche_index_values = list(phyche_index.values()) len_phyche_index = len(phyche_index_values[0]) for u in range(len_phyche_index): temp_sum += pow(float(phyche_index[nucleotide1][u]) - float(phyche_index[nucleotide2][u]), 2) return temp_sum / len_phyche_index
[docs]def SeriesCorFunction(nucleotide1, nucleotide2, big_lamada, phyche_value): """Get the series correlation Factor(Type 2).""" return float(phyche_value[nucleotide1][big_lamada]) * float(phyche_value[nucleotide2][big_lamada])
[docs]def GetParallelFactor(k, lamada, sequence, phyche_value): """Get the corresponding factor theta list.""" theta = [] l = len(sequence) for i in range(1, lamada + 1): temp_sum = 0.0 for j in range(0, l - k - i + 1): nucleotide1 = sequence[j: j+k] nucleotide2 = sequence[j+i: j+i+k] temp_sum += ParallelCorFunction(nucleotide1, nucleotide2, phyche_value) theta.append(temp_sum / (l - k - i + 1)) return theta
[docs]def GetSeriesFactor(k, lamada, sequence, phyche_value): """Get the corresponding series factor theta list.""" theta = [] l_seq = len(sequence) temp_values = list(phyche_value.values()) max_big_lamada = len(temp_values[0]) for small_lamada in range(1, lamada + 1): for big_lamada in range(max_big_lamada): temp_sum = 0.0 for i in range(0, l_seq - k - small_lamada + 1): nucleotide1 = sequence[i: i+k] nucleotide2 = sequence[i+small_lamada: i+small_lamada+k] temp_sum += SeriesCorFunction(nucleotide1, nucleotide2, big_lamada, phyche_value) theta.append(temp_sum / (l_seq - k - small_lamada + 1)) return theta
[docs]def MakePsekncVector(sequence_list, lamada, w, k, phyche_value, theta_type=1): """Generate the pseknc vector.""" kmer = MakeKmerList(k, ALPHABET) vector = [] for sequence in sequence_list: if len(sequence) < k or lamada + k > len(sequence): error_info = "Sorry, the sequence length must be larger than " + str(lamada + k) sys.stderr.write(error_info) sys.exit(0) # Get the nucleotide frequency in the DNA sequence. fre_list = [Frequency(sequence, str(key)) for key in kmer] fre_sum = float(sum(fre_list)) # Get the normalized occurrence frequency of nucleotide in the DNA sequence. fre_list = [e / fre_sum for e in fre_list] # Get the theta_list according the Equation 5. if 1 == theta_type: theta_list = GetParallelFactor(k, lamada, sequence, phyche_value) elif 2 == theta_type: theta_list = GetSeriesFactor(k, lamada, sequence, phyche_value) theta_sum = sum(theta_list) # Generate the vector according the Equation 9. denominator = 1 + w * theta_sum temp_vec = [round(f / denominator, 3) for f in fre_list] for theta in theta_list: temp_vec.append(round(w * theta / denominator, 4)) vector.append(temp_vec) return vector
[docs]def GetParallelFactorPsednc(lamada, sequence, phyche_value): """Get the corresponding factor theta list. This def is just for dinucleotide.""" theta = [] l = len(sequence) for i in range(1, lamada + 1): temp_sum = 0.0 for j in range(0, l - 1 - lamada): nucleotide1 = sequence[j] + sequence[j + 1] nucleotide2 = sequence[j + i] + sequence[j + i + 1] temp_sum += ParallelCorFunction(nucleotide1, nucleotide2, phyche_value) theta.append(temp_sum / (l - i - 1)) return theta
[docs]def MakeOldPsekncVector(sequence_list, lamada, w, k, phyche_value, theta_type=1): """Generate the pseknc vector.""" kmer = MakeKmerList(k, ALPHABET) vector = [] for sequence in sequence_list: if len(sequence) < k or lamada + k > len(sequence): error_info = "Sorry, the sequence length must be larger than " + str(lamada + k) sys.stderr.write(error_info) sys.exit(0) # Get the nucleotide frequency in the DNA sequence. fre_list = [Frequency(sequence, str(key)) for key in kmer] fre_sum = float(sum(fre_list)) # Get the normalized occurrence frequency of nucleotide in the DNA sequence. fre_list = [e / fre_sum for e in fre_list] # Get the theta_list according the Equation 5. if 1 == theta_type: theta_list = GetParallelFactorPsednc(lamada, sequence, phyche_value) elif 2 == theta_type: theta_list = GetSeriesFactor(k, lamada, sequence, phyche_value) theta_sum = sum(theta_list) # Generate the vector according the Equation 9. denominator = 1 + w * theta_sum temp_vec = [round(f / denominator, 3) for f in fre_list] for theta in theta_list: temp_vec.append(round(w * theta / denominator, 4)) vector.append(temp_vec) return vector
if __name__ == '__main__': # get_phyche_index(2, ['Base stacking']) extra_phyche_index = {'AA': [0.06, 0.5, 0.27, 1.59, 0.11, -0.11, 1], 'AC': [1.50, 0.50, 0.80, 0.13, 1.29, 1.04, 1], 'AG': [0.78, 0.36, 0.09, 0.68, -0.24, -0.62, 1], 'AT': [1.07, 0.22, 0.62, -1.02, 2.51, 1.17, 1], 'CA': [-1.38, -1.36, -0.27, -0.86, -0.62, -1.25, 1], 'CC': [0.06, 1.08, 0.09, 0.56, -0.82, 0.24, 1], 'CG': [-1.66, -1.22, -0.44, -0.82, -0.29, -1.39, 1], 'CT': [0.78, 0.36, 0.09, 0.68, -0.24, -0.62, 1], 'GA': [-0.08, 0.5, 0.27, 0.13, -0.39, 0.71, 1], 'GC': [-0.08, 0.22, 1.33, -0.35, 0.65, 1.59, 1], 'GG': [0.06, 1.08, 0.09, 0.56, -0.82, 0.24, 1], 'GT': [1.50, 0.50, 0.80, 0.13, 1.29, 1.04, 1], 'TA': [-1.23, -2.37, -0.44, -2.24, -1.51, -1.39, 1], 'TC': [-0.08, 0.5, 0.27, 0.13, -0.39, 0.71, 1], 'TG': [-1.38, -1.36, -0.27, -0.86, -0.62, -1.25, 1], 'TT': [0.06, 0.5, 0.27, 1.59, 0.11, -0.11, 1]} phyche_index = ExtendPhycheIndex(GetPhycheIndex(k=2, phyche_list=['Base stacking', 'DNA denaturation']), extra_phyche_index) #print(phyche_index) import pandas as pd df = pd.DataFrame(GetPhycheFactorDic(3))