# -*- coding: utf-8 -*-
# Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
# All rights reserved.
# This file is part of the PyBioMed.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the PyBioMed source tree.
"""
##############################################################################
Functions for computing k-mer based DNA descriptors.
You can freely use and distribute it. If you have any problems,
please contact us.
Authors: Zhijiang Yao and Dongsheng Cao.
Date: 2016.10.11
Email: gadsby@163.com and oriental-cds@163.com
##############################################################################
"""
from PyDNAnacutil import MakeUptoKmerList, MakeRevcompKmerList, MakeKmerVector
from PyDNAutil import GetData
def CheckNacPara(k, normalize=False, upto=False, alphabet='ACGT'):
    """
    ###########################################################################
    Validate the parameters used by the k-mer descriptor functions.

    :param k: int, must be a real integer (not a bool) larger than 0.
    :param normalize: bool.
    :param upto: bool.
    :param alphabet: string, must be exactly 'ACGT'.
    :return: None when every parameter is valid.
    :raises ValueError: on the first invalid parameter, checked in the
        order k, normalize, upto, alphabet.
    ###########################################################################
    """
    # bool is a subclass of int, so True/False would otherwise slip through
    # the integer check; reject them explicitly.
    if isinstance(k, bool) or not isinstance(k, int) or k <= 0:
        raise ValueError("Error, parameter k must be an integer and larger than 0.")
    if not isinstance(normalize, bool):
        raise ValueError("Error, parameter normalize must be bool type.")
    if not isinstance(upto, bool):
        raise ValueError("Error, parameter upto must be bool type.")
    if alphabet != 'ACGT':
        raise ValueError("Error, parameter alphabet must be 'ACGT'.")
def GetKmerList(k, upto, alphabet):
    """
    ###########################################################################
    Build the kmer list for the requested k value(s).

    :param k: int, the k value of kmer, it should be larger than 0.
    :param upto: bool, when true the lengths 1, 2, ..., k are requested,
        otherwise only the length k.
    :param alphabet: string, forwarded unchanged to MakeUptoKmerList.
    :return: whatever MakeUptoKmerList returns for those lengths.
    ###########################################################################
    """
    # Only the lower bound of the length range depends on ``upto``.
    smallest_k = 1 if upto else k
    return MakeUptoKmerList(list(range(smallest_k, k + 1)), alphabet)
def GetKmer(data, **kwargs):
    """
    ###########################################################################
    Compute the kmer descriptor of ``data`` and return it as a dictionary.

    Recognised keyword options:

    :param k: int, the k value of kmer, it should be larger than 0
        (default 1).
    :param normalize: bool, normalize the result vector or not
        (default False).
    :param upto: bool, whether to generate 1-kmer, 2-kmer, ..., k-mer
        (default False).
    :param alphabet: string (default "ACGT").
    :param data: the input, wrapped in a one-element list before being
        handed to GetData.
    :return: dict mapping 'Kmer_1', 'Kmer_2', ... to the entries of the
        first vector returned by MakeKmerVector.
    ###########################################################################
    """
    k = kwargs.get('k', 1)
    normalize = kwargs.get('normalize', False)
    upto = kwargs.get('upto', False)
    alphabet = kwargs.get('alphabet', "ACGT")

    sequences = GetData([data])
    kmers = GetKmerList(k, upto, alphabet)

    # Plain kmer counting: empty reverse-complement list, revcomp disabled.
    vectors = MakeKmerVector(sequences, kmers, [], k, upto, False, normalize)

    # Only the first vector is reported; keys are numbered from 1.
    return dict(
        ('Kmer_%s' % position, value)
        for position, value in enumerate(vectors[0], 1)
    )
def GetRevcKmer(data, **kwargs):
    """
    ###########################################################################
    Compute the reverse complement kmer descriptor of ``data`` as a dictionary.

    Recognised keyword options:

    :param k: int, the k value of kmer (default 1).
    :param normalize: bool, normalize the result vector or not
        (default False).
    :param upto: bool, whether to generate 1-kmer, 2-kmer, ..., k-mer
        (default False).
    :param alphabet: string (default "ACGT").
    :param data: the input, wrapped in a one-element list before being
        handed to GetData.
    :return: dict mapping 'RevcKmer_1', 'RevcKmer_2', ... to the entries of
        the first vector returned by MakeKmerVector.
    ###########################################################################
    """
    k = kwargs.get('k', 1)
    normalize = kwargs.get('normalize', False)
    upto = kwargs.get('upto', False)
    alphabet = kwargs.get('alphabet', "ACGT")

    sequences = GetData([data])
    kmers = GetKmerList(k, upto, alphabet)

    # Per the original note: use the lexicographically first version of
    # {kmer, revcomp(kmer)} -- presumably what MakeRevcompKmerList selects.
    revcomp_kmers = MakeRevcompKmerList(kmers)
    vectors = MakeKmerVector(sequences, kmers, revcomp_kmers, k, upto, True, normalize)

    # Only the first vector is reported; keys are numbered from 1.
    return dict(
        ('RevcKmer_%s' % position, value)
        for position, value in enumerate(vectors[0], 1)
    )
def GetIdKmer(data, hs, non_hs, **kwargs):
    """
    ###########################################################################
    Make IDKmer vector.

    For every k in use, the kmer vectors of the positive and negative
    reference sets are summed column-wise and their Diversity is computed.
    Each input sequence is then scored against both references with IdXS,
    rounded to 3 decimals.

    :param data: Input to be processed, passed to GetData as-is
        (presumably a FASTA file or sequence list -- confirm against GetData).
    :param hs: Positive reference set, passed to GetData.
    :param non_hs: Negative reference set, passed to GetData.
    :param k: int, the k value of kmer, it should be larger than 0
        (default 6).
    :param upto: bool, whether to generate 1-kmer, 2-kmer, ..., k-mer
        (default True). When false only the single length k is used.
    :param alphabet: string (default "ACGT").
    :return: list with one entry per input sequence; each entry is a flat
        list [pos_score, neg_score, ...] with one pair per k in use.

    NOTE: earlier revisions overwrote ``upto`` with False right after reading
    it, so the option was silently ignored and only length k was ever used.
    The option is now honoured, which means the default call returns 2 * k
    values per sequence instead of 2.
    ###########################################################################
    """
    from PyDNAnacutil import MakeKmerList
    from PyDNAnacutil import Diversity
    from PyDNAnacutil import IdXS

    k = kwargs.get('k', 6)
    upto = kwargs.get('upto', True)
    alphabet = kwargs.get('alphabet', "ACGT")

    # Fixed options for MakeKmerVector: every k is counted separately, so its
    # own "upto" flag stays off. Kept under a different name so it cannot
    # clobber the caller's ``upto``.
    rev_kmer_list, vector_upto, revcomp, normalize = [], False, False, False

    pos_s_list = GetData(hs)
    neg_s_list = GetData(non_hs)

    if upto:
        k_list = list(range(1, k + 1))
    else:
        k_list = [k]

    # Calculate standard source S vector (column sums over each reference
    # set) and its diversity, one entry per element of k_list.
    pos_s_vec, neg_s_vec = [], []
    diversity_pos_s, diversity_neg_s = [], []
    for cur_k in k_list:
        kmer_list = MakeKmerList(cur_k, alphabet)
        pos_vectors = MakeKmerVector(pos_s_list, kmer_list, rev_kmer_list,
                                     cur_k, vector_upto, revcomp, normalize)
        neg_vectors = MakeKmerVector(neg_s_list, kmer_list, rev_kmer_list,
                                     cur_k, vector_upto, revcomp, normalize)
        pos_total = [sum(column) for column in zip(*pos_vectors)]
        neg_total = [sum(column) for column in zip(*neg_vectors)]
        pos_s_vec.append(pos_total)
        neg_s_vec.append(neg_total)
        diversity_pos_s.append(Diversity(pos_total))
        diversity_neg_s.append(Diversity(neg_total))

    # Calculate Diversity(X) and ID(X, S).
    sequence_list = GetData(data)
    vec = []
    for seq in sequence_list:
        temp_vec = []
        # Index the reference vectors by position in k_list; this is correct
        # both for a single k and for the 1..k range.
        for idx, cur_k in enumerate(k_list):
            kmer_list = MakeKmerList(cur_k, alphabet)
            kmer_vec = MakeKmerVector([seq], kmer_list, rev_kmer_list,
                                      cur_k, vector_upto, revcomp, normalize)
            temp_vec.append(round(IdXS(kmer_vec[0], pos_s_vec[idx], diversity_pos_s[idx]), 3))
            temp_vec.append(round(IdXS(kmer_vec[0], neg_s_vec[idx], diversity_neg_s[idx]), 3))
        vec.append(temp_vec)
    return vec
if __name__ == '__main__':
    # Small demonstration: print the kmer and reverse complement kmer
    # descriptors of one sequence under a few option combinations.
    demo_sequence = 'GACTGAACTGCACTTTGGTTTCATATTATTTGCTC'

    kmer_option_sets = (
        dict(k=2),
        dict(k=2, normalize=True),
        dict(k=2, normalize=True, upto=True),
    )
    for options in kmer_option_sets:
        print(GetKmer(demo_sequence, **options))

    revc_option_sets = (
        dict(k=2, normalize=False, upto=False),
        dict(k=2, normalize=True, upto=True),
    )
    for options in revc_option_sets:
        print(GetRevcKmer(demo_sequence, **options))

    print('\n')