# -*- coding: utf-8 -*-
# Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
# All rights reserved.
# This file is part of the PyBioMed.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the PyBioMed source tree.
"""
Created on Wed May 18 14:06:37 2016
@author: yzj
"""
import sys
ALPHABET = 'ACGT'
"""Used for process original data."""
[docs]class Seq:
def __init__(self, name, seq, no):
self.name = name
self.seq = seq.upper()
self.no = no
self.length = len(seq)
def __str__(self):
"""Output seq when 'print' method is called."""
return "%s\tNo:%s\tlength:%s\n%s" % (self.name, str(self.no), str(self.length), self.seq)
[docs]def IsUnderAlphabet(s, alphabet):
"""
#################################################################
Judge the string is within the scope of the alphabet or not.
:param s: The string.
:param alphabet: alphabet.
Return True or the error character.
#################################################################
"""
for e in s:
if e not in alphabet:
return e
return True
[docs]def IsFasta(seq):
"""
#################################################################
Judge the Seq object is in FASTA format.
Two situation:
1. No seq name.
2. Seq name is illegal.
3. No sequence.
:param seq: Seq object.
#################################################################
"""
if not seq.name:
error_info = 'Error, sequence ' + str(seq.no) + ' has no sequence name.'
print(seq)
sys.stderr.write(error_info)
return False
if -1 != seq.name.find('>'):
error_info = 'Error, sequence ' + str(seq.no) + ' name has > character.'
sys.stderr.write(error_info)
return False
if 0 == seq.length:
error_info = 'Error, sequence ' + str(seq.no) + ' is null.'
sys.stderr.write(error_info)
return False
return True
[docs]def ReadFasta(f):
"""
#################################################################
Read a fasta file.
:param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
Return Seq obj list.
#################################################################
"""
name, seq = '', ''
count = 0
seq_list = []
lines = f.readlines()
for line in lines:
if not line:
break
if '>' == line[0]:
if 0 != count or (0 == count and seq != ''):
if IsFasta(Seq(name, seq, count)):
seq_list.append(Seq(name, seq, count))
else:
sys.exit(0)
seq = ''
name = line[1:].strip()
count += 1
else:
seq += line.strip()
count += 1
if IsFasta(Seq(name, seq, count)):
seq_list.append(Seq(name, seq, count))
else:
sys.exit(0)
return seq_list
[docs]def ReadFastaYield(f):
"""
#################################################################
Yields a Seq object.
:param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
#################################################################
"""
name, seq = '', ''
count = 0
while True:
line = f.readline()
if not line:
break
if '>' == line[0]:
if 0 != count or (0 == count and seq != ''):
if IsFasta(Seq(name, seq, count)):
yield Seq(name, seq, count)
else:
sys.exit(0)
seq = ''
name = line[1:].strip()
count += 1
else:
seq += line.strip()
if IsFasta(Seq(name, seq, count)):
yield Seq(name, seq, count)
else:
sys.exit(0)
[docs]def ReadFastaCheckDna(f):
"""
#################################################################
Read the fasta file, and check its legality.
:param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
Return the seq list.
#################################################################
"""
seq_list = []
for e in ReadFastaYield(f):
# print e
res = IsUnderAlphabet(e.seq, ALPHABET)
if res:
seq_list.append(e)
else:
error_info = 'Sorry, sequence ' + str(e.no) \
+ ' has character ' + str(res) + '.(The character must be A or C or G or T)'
sys.stderr(error_info)
sys.exit(0)
return seq_list
[docs]def GetSequenceCheckDna(f):
"""
#################################################################
Read the fasta file.
Input: f: HANDLE to input. e.g. sys.stdin, or open(<file>)
Return the sequence list.
#################################################################
"""
sequence_list = []
for e in ReadFastaYield(f):
# print e
res = IsUnderAlphabet(e.seq, ALPHABET)
if res is not True:
error_info = 'Sorry, sequence ' + str(e.no) \
+ ' has character ' + str(res) + '.(The character must be A, C, G or T)'
sys.stderr.write(error_info)
sys.exit(0)
else:
sequence_list.append(e.seq)
return sequence_list
[docs]def IsSequenceList(sequence_list):
"""
#################################################################
Judge the sequence list is within the scope of alphabet and
change the lowercase to capital.
#################################################################
"""
count = 0
new_sequence_list = []
for e in sequence_list:
e = e.upper()
count += 1
res = IsUnderAlphabet(e, ALPHABET)
if res is not True:
error_info = 'Sorry, sequence ' + str(count) \
+ ' has illegal character ' + str(res) + '.(The character must be A, C, G or T)'
sys.stderr.write(error_info)
return False
else:
new_sequence_list.append(e)
return new_sequence_list
[docs]def GetData(input_data, desc=False):
"""
#################################################################
Get sequence data from file or list with check.
:param input_data: type file or list
:param desc: with this option, the return value will be a Seq object list(it only works in file object).
:return: sequence data or shutdown.
#################################################################
"""
if hasattr(input_data, 'read'):
if desc is False:
return GetSequenceCheckDna(input_data)
else:
return ReadFastaCheckDna(input_data)
elif isinstance(input_data, list):
input_data = IsSequenceList(input_data)
if input_data is not False:
return input_data
else:
sys.exit(0)
else:
error_info = 'Sorry, the parameter in get_data method must be list or file type.'
sys.stderr.write(error_info)
sys.exit(0)
"""Some basic function for generate feature vector."""
[docs]def Frequency(tol_str, tar_str):
"""
#################################################################
Generate the frequency of tar_str in tol_str.
:param tol_str: mother string.
:param tar_str: substring.
#################################################################
"""
i, j, tar_count = 0, 0, 0
len_tol_str = len(tol_str)
len_tar_str = len(tar_str)
while i < len_tol_str and j < len_tar_str:
if tol_str[i] == tar_str[j]:
i += 1
j += 1
if j >= len_tar_str:
tar_count += 1
i = i - j + 1
j = 0
else:
i = i - j + 1
j = 0
return tar_count
[docs]def WriteLibsvm(vector_list, label_list, write_file):
"""
#################################################################
Write the vector into disk in libSVM format.
#################################################################
"""
len_vector_list = len(vector_list)
len_label_list = len(label_list)
if len_vector_list == 0:
sys.stderr.write("The vector is none.")
sys.exit(1)
if len_label_list == 0:
sys.stderr.write("The label is none.")
sys.exit(1)
if len_vector_list != len_label_list:
sys.stderr.write("The length of vector and label is different.")
sys.exit(1)
with open(write_file, 'w') as f:
len_vector = len(vector_list[0])
for i in range(len_vector_list):
temp_write = str(label_list[i])
for j in range(0, len_vector):
temp_write += ' ' + str(j + 1) + ':' + str(vector_list[i][j])
f.write(temp_write)
f.write('\n')
[docs]def GeneratePhycheValue(k, phyche_index=None, all_property=False, extra_phyche_index=None):
"""
#################################################################
Combine the user selected phyche_list, is_all_property and
extra_phyche_index to a new standard phyche_value.
#################################################################
"""
if phyche_index is None:
phyche_index = []
if extra_phyche_index is None:
extra_phyche_index = {}
diphyche_list = ['Base stacking', 'Protein induced deformability', 'B-DNA twist', 'Dinucleotide GC Content',
'A-philicity', 'Propeller twist', 'Duplex stability:(freeenergy)',
'Duplex tability(disruptenergy)', 'DNA denaturation', 'Bending stiffness', 'Protein DNA twist',
'Stabilising energy of Z-DNA', 'Aida_BA_transition', 'Breslauer_dG', 'Breslauer_dH',
'Breslauer_dS', 'Electron_interaction', 'Hartman_trans_free_energy', 'Helix-Coil_transition',
'Ivanov_BA_transition', 'Lisser_BZ_transition', 'Polar_interaction', 'SantaLucia_dG',
'SantaLucia_dH', 'SantaLucia_dS', 'Sarai_flexibility', 'Stability', 'Stacking_energy',
'Sugimoto_dG', 'Sugimoto_dH', 'Sugimoto_dS', 'Watson-Crick_interaction', 'Twist', 'Tilt',
'Roll', 'Shift', 'Slide', 'Rise']
triphyche_list = ['Dnase I', 'Bendability (DNAse)', 'Bendability (consensus)', 'Trinucleotide GC Content',
'Nucleosome positioning', 'Consensus_roll', 'Consensus-Rigid', 'Dnase I-Rigid', 'MW-Daltons',
'MW-kg', 'Nucleosome', 'Nucleosome-Rigid']
# Set and check physicochemical properties.
if 2 == k:
if all_property is True:
phyche_index = diphyche_list
else:
for e in phyche_index:
if e not in diphyche_list:
error_info = 'Sorry, the physicochemical properties ' + e + ' is not exit.'
import sys
sys.stderr.write(error_info)
sys.exit(0)
elif 3 == k:
if all_property is True:
phyche_index = triphyche_list
else:
for e in phyche_index:
if e not in triphyche_list:
error_info = 'Sorry, the physicochemical properties ' + e + ' is not exit.'
import sys
sys.stderr.write(error_info)
sys.exit(0)
# Generate phyche_value.
from PyBioMed.PyDNA.PyDNApsenacutil import GetPhycheIndex, ExtendPhycheIndex
return ExtendPhycheIndex(GetPhycheIndex(k, phyche_index), extra_phyche_index)
[docs]def ConvertPhycheIndexToDict(phyche_index):
"""
#################################################################
Convert phyche index from list to dict.
#################################################################
"""
# for e in phyche_index:
# print e
len_index_value = len(phyche_index[0])
k = 0
for i in range(1, 10):
if len_index_value < 4**i:
error_infor = 'Sorry, the number of each index value is must be 4^k.'
sys.stdout.write(error_infor)
sys.exit(0)
if len_index_value == 4**i:
k = i
break
from PyBioMed.PyDNA.PyDNAnacutil import MakeKmerList
kmer_list = MakeKmerList(k, ALPHABET)
# print kmer_list
len_kmer = len(kmer_list)
phyche_index_dict = {}
for kmer in kmer_list:
phyche_index_dict[kmer] = []
# print phyche_index_dict
phyche_index = list(zip(*phyche_index))
for i in range(len_kmer):
phyche_index_dict[kmer_list[i]] = list(phyche_index[i])
return phyche_index_dict
[docs]def StandardDeviation(value_list):
"""
#################################################################
Return standard deviation.
#################################################################
"""
from math import sqrt
from math import pow
n = len(value_list)
average_value = sum(value_list) * 1.0 / n
return sqrt(sum([pow(e - average_value, 2) for e in value_list]) * 1.0 / (n-1))
[docs]def NormalizeIndex(phyche_index, is_convert_dict=False):
"""
#################################################################
Normalize the physicochemical index.
#################################################################
"""
normalize_phyche_value = []
for phyche_value in phyche_index:
average_phyche_value = sum(phyche_value) * 1.0 / len(phyche_value)
sd_phyche = StandardDeviation(phyche_value)
normalize_phyche_value.append([round((e - average_phyche_value) / sd_phyche, 2) for e in phyche_value])
if is_convert_dict is True:
return ConvertPhycheIndexToDict(normalize_phyche_value)
return normalize_phyche_value
[docs]def DNAChecks(s):
for e in s:
if e not in ALPHABET:
return e
return True
if __name__ == "__main__":
re = ['GACTGAACTGCACTTTGGTTTCATATTATTTGCTC']
phyche_index = [[1.019, -0.918, 0.488, 0.567, 0.567, -0.070, -0.579, 0.488, -0.654, -2.455,-0.070, -0.918, 1.603, -0.654, 0.567, 1.019]]
print (NormalizeIndex(phyche_index,is_convert_dict = False)[0])