Source code for PyPretreatDNA

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
Created on Wed May 18 14:06:37 2016

@author: yzj
"""


import sys

ALPHABET = 'ACGT'


"""Used for process original data."""


[docs]class Seq: def __init__(self, name, seq, no): self.name = name self.seq = seq.upper() self.no = no self.length = len(seq) def __str__(self): """Output seq when 'print' method is called.""" return "%s\tNo:%s\tlength:%s\n%s" % (self.name, str(self.no), str(self.length), self.seq)
[docs]def IsUnderAlphabet(s, alphabet): """ ################################################################# Judge the string is within the scope of the alphabet or not. :param s: The string. :param alphabet: alphabet. Return True or the error character. ################################################################# """ for e in s: if e not in alphabet: return e return True
[docs]def IsFasta(seq): """ ################################################################# Judge the Seq object is in FASTA format. Two situation: 1. No seq name. 2. Seq name is illegal. 3. No sequence. :param seq: Seq object. ################################################################# """ if not seq.name: error_info = 'Error, sequence ' + str(seq.no) + ' has no sequence name.' print(seq) sys.stderr.write(error_info) return False if -1 != seq.name.find('>'): error_info = 'Error, sequence ' + str(seq.no) + ' name has > character.' sys.stderr.write(error_info) return False if 0 == seq.length: error_info = 'Error, sequence ' + str(seq.no) + ' is null.' sys.stderr.write(error_info) return False return True
[docs]def ReadFasta(f): """ ################################################################# Read a fasta file. :param f: HANDLE to input. e.g. sys.stdin, or open(<file>) Return Seq obj list. ################################################################# """ name, seq = '', '' count = 0 seq_list = [] lines = f.readlines() for line in lines: if not line: break if '>' == line[0]: if 0 != count or (0 == count and seq != ''): if IsFasta(Seq(name, seq, count)): seq_list.append(Seq(name, seq, count)) else: sys.exit(0) seq = '' name = line[1:].strip() count += 1 else: seq += line.strip() count += 1 if IsFasta(Seq(name, seq, count)): seq_list.append(Seq(name, seq, count)) else: sys.exit(0) return seq_list
[docs]def ReadFastaYield(f): """ ################################################################# Yields a Seq object. :param f: HANDLE to input. e.g. sys.stdin, or open(<file>) ################################################################# """ name, seq = '', '' count = 0 while True: line = f.readline() if not line: break if '>' == line[0]: if 0 != count or (0 == count and seq != ''): if IsFasta(Seq(name, seq, count)): yield Seq(name, seq, count) else: sys.exit(0) seq = '' name = line[1:].strip() count += 1 else: seq += line.strip() if IsFasta(Seq(name, seq, count)): yield Seq(name, seq, count) else: sys.exit(0)
[docs]def ReadFastaCheckDna(f): """ ################################################################# Read the fasta file, and check its legality. :param f: HANDLE to input. e.g. sys.stdin, or open(<file>) Return the seq list. ################################################################# """ seq_list = [] for e in ReadFastaYield(f): # print e res = IsUnderAlphabet(e.seq, ALPHABET) if res: seq_list.append(e) else: error_info = 'Sorry, sequence ' + str(e.no) \ + ' has character ' + str(res) + '.(The character must be A or C or G or T)' sys.stderr(error_info) sys.exit(0) return seq_list
[docs]def GetSequenceCheckDna(f): """ ################################################################# Read the fasta file. Input: f: HANDLE to input. e.g. sys.stdin, or open(<file>) Return the sequence list. ################################################################# """ sequence_list = [] for e in ReadFastaYield(f): # print e res = IsUnderAlphabet(e.seq, ALPHABET) if res is not True: error_info = 'Sorry, sequence ' + str(e.no) \ + ' has character ' + str(res) + '.(The character must be A, C, G or T)' sys.stderr.write(error_info) sys.exit(0) else: sequence_list.append(e.seq) return sequence_list
[docs]def IsSequenceList(sequence_list): """ ################################################################# Judge the sequence list is within the scope of alphabet and change the lowercase to capital. ################################################################# """ count = 0 new_sequence_list = [] for e in sequence_list: e = e.upper() count += 1 res = IsUnderAlphabet(e, ALPHABET) if res is not True: error_info = 'Sorry, sequence ' + str(count) \ + ' has illegal character ' + str(res) + '.(The character must be A, C, G or T)' sys.stderr.write(error_info) return False else: new_sequence_list.append(e) return new_sequence_list
[docs]def GetData(input_data, desc=False): """ ################################################################# Get sequence data from file or list with check. :param input_data: type file or list :param desc: with this option, the return value will be a Seq object list(it only works in file object). :return: sequence data or shutdown. ################################################################# """ if hasattr(input_data, 'read'): if desc is False: return GetSequenceCheckDna(input_data) else: return ReadFastaCheckDna(input_data) elif isinstance(input_data, list): input_data = IsSequenceList(input_data) if input_data is not False: return input_data else: sys.exit(0) else: error_info = 'Sorry, the parameter in get_data method must be list or file type.' sys.stderr.write(error_info) sys.exit(0)
"""Some basic function for generate feature vector."""
[docs]def Frequency(tol_str, tar_str): """ ################################################################# Generate the frequency of tar_str in tol_str. :param tol_str: mother string. :param tar_str: substring. ################################################################# """ i, j, tar_count = 0, 0, 0 len_tol_str = len(tol_str) len_tar_str = len(tar_str) while i < len_tol_str and j < len_tar_str: if tol_str[i] == tar_str[j]: i += 1 j += 1 if j >= len_tar_str: tar_count += 1 i = i - j + 1 j = 0 else: i = i - j + 1 j = 0 return tar_count
[docs]def WriteLibsvm(vector_list, label_list, write_file): """ ################################################################# Write the vector into disk in libSVM format. ################################################################# """ len_vector_list = len(vector_list) len_label_list = len(label_list) if len_vector_list == 0: sys.stderr.write("The vector is none.") sys.exit(1) if len_label_list == 0: sys.stderr.write("The label is none.") sys.exit(1) if len_vector_list != len_label_list: sys.stderr.write("The length of vector and label is different.") sys.exit(1) with open(write_file, 'w') as f: len_vector = len(vector_list[0]) for i in range(len_vector_list): temp_write = str(label_list[i]) for j in range(0, len_vector): temp_write += ' ' + str(j + 1) + ':' + str(vector_list[i][j]) f.write(temp_write) f.write('\n')
[docs]def GeneratePhycheValue(k, phyche_index=None, all_property=False, extra_phyche_index=None): """ ################################################################# Combine the user selected phyche_list, is_all_property and extra_phyche_index to a new standard phyche_value. ################################################################# """ if phyche_index is None: phyche_index = [] if extra_phyche_index is None: extra_phyche_index = {} diphyche_list = ['Base stacking', 'Protein induced deformability', 'B-DNA twist', 'Dinucleotide GC Content', 'A-philicity', 'Propeller twist', 'Duplex stability:(freeenergy)', 'Duplex tability(disruptenergy)', 'DNA denaturation', 'Bending stiffness', 'Protein DNA twist', 'Stabilising energy of Z-DNA', 'Aida_BA_transition', 'Breslauer_dG', 'Breslauer_dH', 'Breslauer_dS', 'Electron_interaction', 'Hartman_trans_free_energy', 'Helix-Coil_transition', 'Ivanov_BA_transition', 'Lisser_BZ_transition', 'Polar_interaction', 'SantaLucia_dG', 'SantaLucia_dH', 'SantaLucia_dS', 'Sarai_flexibility', 'Stability', 'Stacking_energy', 'Sugimoto_dG', 'Sugimoto_dH', 'Sugimoto_dS', 'Watson-Crick_interaction', 'Twist', 'Tilt', 'Roll', 'Shift', 'Slide', 'Rise'] triphyche_list = ['Dnase I', 'Bendability (DNAse)', 'Bendability (consensus)', 'Trinucleotide GC Content', 'Nucleosome positioning', 'Consensus_roll', 'Consensus-Rigid', 'Dnase I-Rigid', 'MW-Daltons', 'MW-kg', 'Nucleosome', 'Nucleosome-Rigid'] # Set and check physicochemical properties. if 2 == k: if all_property is True: phyche_index = diphyche_list else: for e in phyche_index: if e not in diphyche_list: error_info = 'Sorry, the physicochemical properties ' + e + ' is not exit.' import sys sys.stderr.write(error_info) sys.exit(0) elif 3 == k: if all_property is True: phyche_index = triphyche_list else: for e in phyche_index: if e not in triphyche_list: error_info = 'Sorry, the physicochemical properties ' + e + ' is not exit.' import sys sys.stderr.write(error_info) sys.exit(0) # Generate phyche_value. from PyBioMed.PyDNA.PyDNApsenacutil import GetPhycheIndex, ExtendPhycheIndex return ExtendPhycheIndex(GetPhycheIndex(k, phyche_index), extra_phyche_index)
[docs]def ConvertPhycheIndexToDict(phyche_index): """ ################################################################# Convert phyche index from list to dict. ################################################################# """ # for e in phyche_index: # print e len_index_value = len(phyche_index[0]) k = 0 for i in range(1, 10): if len_index_value < 4**i: error_infor = 'Sorry, the number of each index value is must be 4^k.' sys.stdout.write(error_infor) sys.exit(0) if len_index_value == 4**i: k = i break from PyBioMed.PyDNA.PyDNAnacutil import MakeKmerList kmer_list = MakeKmerList(k, ALPHABET) # print kmer_list len_kmer = len(kmer_list) phyche_index_dict = {} for kmer in kmer_list: phyche_index_dict[kmer] = [] # print phyche_index_dict phyche_index = list(zip(*phyche_index)) for i in range(len_kmer): phyche_index_dict[kmer_list[i]] = list(phyche_index[i]) return phyche_index_dict
[docs]def StandardDeviation(value_list): """ ################################################################# Return standard deviation. ################################################################# """ from math import sqrt from math import pow n = len(value_list) average_value = sum(value_list) * 1.0 / n return sqrt(sum([pow(e - average_value, 2) for e in value_list]) * 1.0 / (n-1))
[docs]def NormalizeIndex(phyche_index, is_convert_dict=False): """ ################################################################# Normalize the physicochemical index. ################################################################# """ normalize_phyche_value = [] for phyche_value in phyche_index: average_phyche_value = sum(phyche_value) * 1.0 / len(phyche_value) sd_phyche = StandardDeviation(phyche_value) normalize_phyche_value.append([round((e - average_phyche_value) / sd_phyche, 2) for e in phyche_value]) if is_convert_dict is True: return ConvertPhycheIndexToDict(normalize_phyche_value) return normalize_phyche_value
[docs]def DNAChecks(s): for e in s: if e not in ALPHABET: return e return True
if __name__ == "__main__": re = ['GACTGAACTGCACTTTGGTTTCATATTATTTGCTC'] phyche_index = [[1.019, -0.918, 0.488, 0.567, 0.567, -0.070, -0.579, 0.488, -0.654, -2.455,-0.070, -0.918, 1.603, -0.654, 0.567, 1.019]] print (NormalizeIndex(phyche_index,is_convert_dict = False)[0])