# Source code for PyPretreatDNA

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
Created on Wed May 18 14:06:37 2016

@author: yzj
"""


import sys

# DNA alphabet: the characters the sequence checks in this module accept.
ALPHABET = 'ACGT'


"""Used for process original data."""


class Seq:
    """
    #################################################################
    One named sequence record.

    :param name: the sequence name.
    :param seq: the sequence text; it is stored upper-cased.
    :param no: the number given to this record by the caller.
    #################################################################
    """

    def __init__(self, name, seq, no):
        # name / no are kept exactly as passed in.
        self.name, self.no = name, no
        # The stored sequence is the upper-cased text; the length is taken
        # from the text as it was passed in.
        self.seq = seq.upper()
        self.length = len(seq)

    def __str__(self):
        """Return the name, number and length on one line, then the sequence."""
        fields = (self.name, str(self.no), str(self.length), self.seq)
        return "%s\tNo:%s\tlength:%s\n%s" % fields


def IsUnderAlphabet(s, alphabet):
    """
    #################################################################
    Check that every character of a string belongs to an alphabet.

    :param s: The string to check.
    :param alphabet: The allowed characters.

    Return True when all characters are allowed (an empty string
    included), otherwise the first character that is not allowed.
    #################################################################
    """
    # The default of next() supplies True when no offending character exists.
    offenders = (ch for ch in s if ch not in alphabet)
    return next(offenders, True)


def IsFasta(seq):
    """
    #################################################################
    Judge whether a Seq object is a legal FASTA record.

    A record is rejected in three situations, checked in this order:
    1. No seq name.
    2. Seq name is illegal (it contains the '>' character).
    3. No sequence (the length is 0).

    On rejection an error message is written to stderr; for a missing
    name the record itself is additionally printed to stdout.

    :param seq: Seq object (its name, no and length attributes are read).

    Return True for a legal record, False otherwise.
    #################################################################
    """
    nameless = not seq.name
    if nameless:
        complaint = ' has no sequence name.'
    elif '>' in seq.name:
        complaint = ' name has > character.'
    elif seq.length == 0:
        complaint = ' is null.'
    else:
        return True

    error_info = 'Error, sequence ' + str(seq.no) + complaint
    if nameless:
        # Only the missing-name case dumps the record to stdout.
        print(seq)
    sys.stderr.write(error_info)
    return False


def ReadFasta(f):
    """
    #################################################################
    Read a fasta file.

    Records are numbered 1, 2, 3, ... in the order their '>' header
    lines appear; the last record is numbered like all the others.
    Text found before the first header is checked as a record with an
    empty name and number 0 (which IsFasta rejects).

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return Seq obj list.

    Every record is checked with IsFasta; on the first illegal record
    the program is terminated through sys.exit(0).
    #################################################################
    """
    def _checked(title, pieces, index):
        # Build the record once, validate it, and abort on failure.
        record = Seq(title, ''.join(pieces), index)
        if not IsFasta(record):
            sys.exit(0)
        return record

    name = ''
    pieces = []  # non-empty stripped sequence lines of the current record
    count = 0    # number of header lines seen so far
    seq_list = []
    for line in f.readlines():
        if line.startswith('>'):
            # Flush the previous record, or any headerless leading text.
            if count != 0 or pieces:
                seq_list.append(_checked(name, pieces, count))

            pieces = []
            name = line[1:].strip()
            count += 1
        else:
            stripped = line.strip()
            if stripped:
                pieces.append(stripped)

    # Flush the final record. count already equals its 1-based number, so
    # it must not be incremented again here.
    seq_list.append(_checked(name, pieces, count))

    return seq_list


def ReadFastaYield(f):
    """
    #################################################################
    Generator over the records of a fasta file, one Seq object at a
    time.

    Lines are pulled with f.readline() until it returns an empty
    value. Each record is checked with IsFasta before it is yielded;
    an illegal record terminates the program through sys.exit(0).

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
    #################################################################
    """
    def _validated(title, residues, index):
        # Reject the record (and stop the program) unless IsFasta accepts it.
        if not IsFasta(Seq(title, residues, index)):
            sys.exit(0)
        return Seq(title, residues, index)

    title, residues = '', ''
    headers_seen = 0
    line = f.readline()
    while line:
        if line[0] == '>':
            # A header closes the previous record, or any text that came
            # before the very first header.
            if headers_seen != 0 or residues != '':
                yield _validated(title, residues, headers_seen)

            residues = ''
            title = line[1:].strip()
            headers_seen += 1
        else:
            residues += line.strip()
        line = f.readline()

    # The record still open at end of input.
    yield _validated(title, residues, headers_seen)


def ReadFastaCheckDna(f):
    """
    #################################################################
    Read the fasta file, and check its legality.

    Every record produced by ReadFastaYield must consist only of the
    characters in ALPHABET. On the first record that does not, an
    error message is written to stderr and the program is terminated
    through sys.exit(0).

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return the seq list (Seq objects).
    #################################################################
    """
    seq_list = []
    for e in ReadFastaYield(f):
        res = IsUnderAlphabet(e.seq, ALPHABET)
        # IsUnderAlphabet returns True or the offending character. That
        # character is a non-empty (truthy) string, so the result has to be
        # compared with True by identity rather than tested for truthiness.
        if res is not True:
            error_info = 'Sorry, sequence ' + str(e.no) \
                         + ' has character ' + str(res) + '.(The character must be A or C or G or T)'
            sys.stderr.write(error_info)
            sys.exit(0)
        seq_list.append(e)

    return seq_list


def GetSequenceCheckDna(f):
    """
    #################################################################
    Read the fasta file and collect the sequence strings only.

    Each record from ReadFastaYield is checked against ALPHABET; the
    first record with another character causes an error message on
    stderr followed by sys.exit(0).

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return the sequence list (strings, not Seq objects).
    #################################################################
    """
    sequence_list = []
    for record in ReadFastaYield(f):
        verdict = IsUnderAlphabet(record.seq, ALPHABET)
        if verdict is True:
            sequence_list.append(record.seq)
            continue

        # verdict holds the first character outside the alphabet.
        message = 'Sorry, sequence ' + str(record.no) \
                  + ' has character ' + str(verdict) + '.(The character must be A, C, G or T)'
        sys.stderr.write(message)
        sys.exit(0)

    return sequence_list


def IsSequenceList(sequence_list):
    """
    #################################################################
    Upper-case every sequence of a list and check it against ALPHABET.

    :param sequence_list: list of sequence strings.

    Return the list of upper-cased sequences, or False (after writing
    an error message to stderr) as soon as one sequence contains a
    character outside the alphabet. Sequences are numbered from 1 in
    the error message.
    #################################################################
    """
    checked = []
    for position, raw in enumerate(sequence_list, 1):
        upper = raw.upper()
        verdict = IsUnderAlphabet(upper, ALPHABET)
        if verdict is True:
            checked.append(upper)
            continue

        # verdict holds the first illegal character of this sequence.
        message = 'Sorry, sequence ' + str(position) \
                  + ' has illegal character ' + str(verdict) + '.(The character must be A, C, G or T)'
        sys.stderr.write(message)
        return False

    return checked


def GetData(input_data, desc=False):
    """
    #################################################################
    Get checked sequence data from a file object or from a list.

    :param input_data: an object with a 'read' attribute (treated as a
        fasta file handle) or a list of sequence strings.
    :param desc: only used for file objects; when it is anything other
        than False the return value is a list of Seq objects instead
        of a list of sequence strings.
    :return: the checked sequence data. For a list that fails the
        check, or for any other input type, the program is terminated
        through sys.exit(0).
    #################################################################
    """
    if hasattr(input_data, 'read'):
        reader = GetSequenceCheckDna if desc is False else ReadFastaCheckDna
        return reader(input_data)

    if not isinstance(input_data, list):
        message = 'Sorry, the parameter in get_data method must be list or file type.'
        sys.stderr.write(message)
        sys.exit(0)

    # IsSequenceList signals an illegal sequence by returning False.
    checked = IsSequenceList(input_data)
    if checked is False:
        sys.exit(0)
    return checked


"""Some basic function for generate feature vector."""


def Frequency(tol_str, tar_str):
    """
    #################################################################
    Count how many times tar_str occurs in tol_str. Occurrences may
    overlap, e.g. 'AA' occurs three times in 'AAAA'.

    :param tol_str: mother string.
    :param tar_str: substring.

    Return the number of occurrences; 0 when tar_str is empty.
    #################################################################
    """
    width = len(tar_str)
    if width == 0:
        return 0

    # Try every start position that leaves room for a full match and
    # compare element by element.
    last_start = len(tol_str) - width
    hits = 0
    for start in range(last_start + 1):
        if all(tol_str[start + offset] == tar_str[offset] for offset in range(width)):
            hits += 1

    return hits


def WriteLibsvm(vector_list, label_list, write_file):
    """
    #################################################################
    Write the vectors to disk in libSVM format, one line per vector:
    the label followed by ' <index>:<value>' pairs, with the index
    starting at 1.

    :param vector_list: list of feature vectors; the length of the
        first vector is used as the number of features of every line.
    :param label_list: list of labels, one per vector.
    :param write_file: path of the file to write (it is overwritten).

    An empty vector list, an empty label list or lists of different
    length write a message to stderr and end the program through
    sys.exit(1).
    #################################################################
    """
    vector_total, label_total = len(vector_list), len(label_list)

    problem = None
    if vector_total == 0:
        problem = "The vector is none."
    elif label_total == 0:
        problem = "The label is none."
    elif vector_total != label_total:
        problem = "The length of vector and label is different."
    if problem is not None:
        sys.stderr.write(problem)
        sys.exit(1)

    with open(write_file, 'w') as out:
        width = len(vector_list[0])
        for row_no in range(vector_total):
            fields = [str(label_list[row_no])]
            fields.extend(str(col + 1) + ':' + str(vector_list[row_no][col])
                          for col in range(width))
            out.write(' '.join(fields))
            out.write('\n')

def GeneratePhycheValue(k, phyche_index=None, all_property=False, extra_phyche_index=None):
    """
    #################################################################
    Combine the user selected phyche_index, all_property and
    extra_phyche_index into one phyche_value.

    :param k: 2 selects the dinucleotide property names, 3 the
        trinucleotide property names; for any other value the names
        are not checked.
    :param phyche_index: list of property names (default: empty list).
    :param all_property: when True (and k is 2 or 3) every known
        property name for that k is used instead of phyche_index.
    :param extra_phyche_index: extra values handed to
        ExtendPhycheIndex (default: empty dict).
    :return: the result of
        ExtendPhycheIndex(GetPhycheIndex(k, phyche_index), extra_phyche_index).

    An unknown property name writes a message to stderr and ends the
    program through sys.exit(0).
    #################################################################
    """
    phyche_index = [] if phyche_index is None else phyche_index
    extra_phyche_index = {} if extra_phyche_index is None else extra_phyche_index

    diphyche_list = ['Base stacking', 'Protein induced deformability', 'B-DNA twist', 'Dinucleotide GC Content',
                     'A-philicity', 'Propeller twist', 'Duplex stability:(freeenergy)',
                     'Duplex tability(disruptenergy)', 'DNA denaturation', 'Bending stiffness', 'Protein DNA twist',
                     'Stabilising energy of Z-DNA', 'Aida_BA_transition', 'Breslauer_dG', 'Breslauer_dH',
                     'Breslauer_dS', 'Electron_interaction', 'Hartman_trans_free_energy', 'Helix-Coil_transition',
                     'Ivanov_BA_transition', 'Lisser_BZ_transition', 'Polar_interaction', 'SantaLucia_dG',
                     'SantaLucia_dH', 'SantaLucia_dS', 'Sarai_flexibility', 'Stability', 'Stacking_energy',
                     'Sugimoto_dG', 'Sugimoto_dH', 'Sugimoto_dS', 'Watson-Crick_interaction', 'Twist', 'Tilt',
                     'Roll', 'Shift', 'Slide', 'Rise']
    triphyche_list = ['Dnase I', 'Bendability (DNAse)', 'Bendability (consensus)', 'Trinucleotide GC Content',
                      'Nucleosome positioning', 'Consensus_roll', 'Consensus-Rigid', 'Dnase I-Rigid', 'MW-Daltons',
                      'MW-kg', 'Nucleosome', 'Nucleosome-Rigid']

    # Pick the table of known property names for this k (none for other k).
    if 2 == k:
        known_names = diphyche_list
    elif 3 == k:
        known_names = triphyche_list
    else:
        known_names = None

    # Set and check physicochemical properties.
    if known_names is not None:
        if all_property is True:
            phyche_index = known_names
        else:
            for wanted in phyche_index:
                if wanted not in known_names:
                    message = 'Sorry, the physicochemical properties ' + wanted + ' is not exit.'
                    sys.stderr.write(message)
                    sys.exit(0)

    # Generate phyche_value.
    from PyBioMed.PyDNA.PyDNApsenacutil import GetPhycheIndex, ExtendPhycheIndex

    base_value = GetPhycheIndex(k, phyche_index)
    return ExtendPhycheIndex(base_value, extra_phyche_index)


def ConvertPhycheIndexToDict(phyche_index):
    """
    #################################################################
    Convert phyche index from list to dict.

    :param phyche_index: non-empty list of value rows. The length of
        the first row must be 4^k for some k >= 1.
    :return: dict mapping each k-mer from MakeKmerList(k, ALPHABET) to
        the list of values found at that k-mer's position in every row.
        (Presumably MakeKmerList returns the 4^k k-mers in the same
        order as the row values -- confirm against PyDNAnacutil.)

    A row length that is not a power of 4 writes a message to stderr
    and ends the program through sys.exit(0) (the exit status is kept
    for backward compatibility).
    #################################################################
    """
    len_index_value = len(phyche_index[0])

    # Find the smallest k >= 1 with 4^k >= row length; the length is legal
    # only when it equals that power exactly. No upper bound on k is needed.
    k = 1
    while 4 ** k < len_index_value:
        k += 1
    if 4 ** k != len_index_value:
        error_info = 'Sorry, the number of each index value is must be 4^k.'
        # Diagnostics belong on stderr, like every other error in this module.
        sys.stderr.write(error_info)
        sys.exit(0)

    from PyBioMed.PyDNA.PyDNAnacutil import MakeKmerList
    kmer_list = MakeKmerList(k, ALPHABET)

    # Transpose: columns[i] holds the i-th value of every row.
    columns = list(zip(*phyche_index))
    phyche_index_dict = {}
    for i, kmer in enumerate(kmer_list):
        phyche_index_dict[kmer] = list(columns[i])

    return phyche_index_dict


def StandardDeviation(value_list):
    """
    #################################################################
    Return the standard deviation of the values, computed with the
    n - 1 divisor.

    :param value_list: list of numbers.

    Fewer than two values raise ZeroDivisionError.
    #################################################################
    """
    import math

    count = len(value_list)
    mean = sum(value_list) * 1.0 / count
    squared_deviations = [math.pow(value - mean, 2) for value in value_list]
    variance = sum(squared_deviations) * 1.0 / (count - 1)
    return math.sqrt(variance)


def NormalizeIndex(phyche_index, is_convert_dict=False):
    """
    #################################################################
    Normalize the physicochemical index: inside every row each value
    becomes (value - row mean) / row standard deviation, rounded to
    two decimals.

    :param phyche_index: list of value rows.
    :param is_convert_dict: when True the normalized rows are passed
        through ConvertPhycheIndexToDict and that result is returned.
    :return: the list of normalized rows, or the converted dict.
    #################################################################
    """
    def _standardize(row):
        # The mean is taken here; the spread comes from StandardDeviation.
        mean = sum(row) * 1.0 / len(row)
        spread = StandardDeviation(row)
        return [round((value - mean) / spread, 2) for value in row]

    normalized_rows = [_standardize(row) for row in phyche_index]

    if is_convert_dict is True:
        return ConvertPhycheIndexToDict(normalized_rows)
    return normalized_rows


def DNAChecks(s):
    """
    #################################################################
    Check that a string only contains characters of ALPHABET.

    :param s: The string to check.

    Return True when every character is in ALPHABET, otherwise the
    first character that is not.
    #################################################################
    """
    return next((base for base in s if base not in ALPHABET), True)

    
if __name__ == "__main__":
    # Sample sequence; it is not used by the demo below.
    re = ['GACTGAACTGCACTTTGGTTTCATATTATTTGCTC']

    # One row of 16 values, normalized and printed as a list.
    phyche_index = [[
        1.019, -0.918, 0.488, 0.567,
        0.567, -0.070, -0.579, 0.488,
        -0.654, -2.455, -0.070, -0.918,
        1.603, -0.654, 0.567, 1.019,
    ]]
    normalized = NormalizeIndex(phyche_index, is_convert_dict=False)
    print(normalized[0])