Source code for fingerprint

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
This module computes various molecular fingerprints based on the provided

fingerprint systems. If you have any questions, please contact the authors via email.


@author: Zhijiang Yao and Dongsheng Cao

Email: and

from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.AtomPairs import Torsions
from rdkit import DataStructs
from estate import CalculateEstateFingerprint as EstateFingerprint
import pybel
from rdkit.Chem import ChemicalFeatures
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
from rdkit.Chem.Pharm2D import Generate
from ghosecrippen import GhoseCrippenFingerprint
from PubChemFingerprints import calcPubChemFingerAll

# Names of the similarity measures listed in DataStructs.similarityFunctions
# (the first item of each entry), in the order that list provides them.
similaritymeasure=[i[0] for i in DataStructs.similarityFunctions]

def CalculateFP2Fingerprint(mol):
    """
    Calculate FP2 fingerprints (1024 bits).

    Usage:

        result = CalculateFP2Fingerprint(mol)

    Input: mol is a molecule object exposing ``calcfp()``; it is called
    without arguments, so the molecule's default fingerprint type is used.

    Output: result is a 2-tuple. The first item is the number of
    fingerprint bits (1024). The second is a dict whose keys are the bit
    positions reported in ``calcfp().bits``, each mapped to 1.
    """
    on_bits = mol.calcfp().bits
    # Duplicate positions collapse naturally because they share a dict key.
    bit_dict = dict((position, 1) for position in on_bits)
    return 1024, bit_dict
def CalculateFP3Fingerprint(mol):
    """
    Calculate FP3 fingerprints (210 bits).

    Usage:

        result = CalculateFP3Fingerprint(mol)

    Input: mol is a molecule object exposing ``calcfp(fptype)``; the demo
    at the bottom of this file passes a molecule built with
    pybel.readstring.

    Output: result is a 2-tuple. The first item is the number of
    fingerprint bits (210). The second is a dict whose keys are the bit
    positions reported in ``calcfp('FP3').bits``, each mapped to 1.
    """
    on_bits = mol.calcfp('FP3').bits
    bit_dict = dict((position, 1) for position in on_bits)
    return 210, bit_dict
def CalculateFP4Fingerprint(mol):
    """
    Calculate FP4 fingerprints (307 bits).

    Usage:

        result = CalculateFP4Fingerprint(mol)

    Input: mol is a molecule object exposing ``calcfp(fptype)``.

    Output: result is a 2-tuple. The first item is the number of
    fingerprint bits (307). The second is a dict whose keys are the bit
    positions reported in ``calcfp('FP4').bits``, each mapped to 1.
    """
    on_bits = mol.calcfp('FP4').bits
    bit_dict = dict((position, 1) for position in on_bits)
    return 307, bit_dict
def CalculateDaylightFingerprint(mol):
    """
    Calculate Daylight-like (topological) fingerprints (2048 bits).

    Usage:

        result = CalculateDaylightFingerprint(mol)

    Input: mol is a molecule object accepted by
    FingerprintMols.FingerprintMol.

    Output: result is a 3-tuple. The first item is the number of
    fingerprint bits (2048). The second is a dict whose keys are the on-bit
    positions, each mapped to 1. The third is the bit vector returned by
    FingerprintMols.FingerprintMol, which can be used for similarity
    calculations.
    """
    bit_vect = FingerprintMols.FingerprintMol(mol)
    on_bits = tuple(bit_vect.GetOnBits())
    bit_dict = dict((position, 1) for position in on_bits)
    return 2048, bit_dict, bit_vect
def CalculateMACCSFingerprint(mol):
    """
    Calculate MACCS keys (166 bits).

    Usage:

        result = CalculateMACCSFingerprint(mol)

    Input: mol is a molecule object accepted by MACCSkeys.GenMACCSKeys.

    Output: result is a 3-tuple. The first item is the number of
    fingerprint bits (166). The second is a dict whose keys are the on-bit
    positions, each mapped to 1. The third is the bit vector returned by
    MACCSkeys.GenMACCSKeys, which can be used for similarity calculations.
    """
    bit_vect = MACCSkeys.GenMACCSKeys(mol)
    on_bits = tuple(bit_vect.GetOnBits())
    bit_dict = dict((position, 1) for position in on_bits)
    return 166, bit_dict, bit_vect
def CalculateEstateFingerprint(mol):
    """
    Calculate E-state fingerprints (79 bits).

    Usage:

        result = CalculateEstateFingerprint(mol)

    Input: mol is a molecule object accepted by
    estate.CalculateEstateFingerprint.

    Output: result is a 3-tuple. The first item is the number of
    fingerprint bits (79). The second is a dict with one entry (value 1)
    per positive descriptor, keyed by the descriptor name with its first
    seven characters removed. The third is the unmodified mapping returned
    by estate.CalculateEstateFingerprint.
    """
    estate_values = EstateFingerprint(mol)
    # name[7:] drops a fixed-width name prefix -- presumably a common label
    # shared by every key; verify against the estate module.
    present = dict(
        (name[7:], 1)
        for name in estate_values
        if estate_values[name] > 0
    )
    return 79, present, estate_values
def CalculateAtomPairsFingerprint(mol):
    """
    Calculate atom pairs fingerprints.

    Usage:

        result = CalculateAtomPairsFingerprint(mol)

    Input: mol is a molecule object accepted by
    Pairs.GetAtomPairFingerprint.

    Output: result is a 3-tuple. The first item is the fingerprint length
    (``GetLength()``). The second is the dict of non-zero elements
    (``GetNonzeroElements()``). The third is the fingerprint object itself,
    which can be used for similarity calculations.
    """
    fingerprint = Pairs.GetAtomPairFingerprint(mol)
    length = fingerprint.GetLength()
    nonzero = fingerprint.GetNonzeroElements()
    return length, nonzero, fingerprint
def CalculateTopologicalTorsionFingerprint(mol):
    """
    Calculate topological torsion fingerprints.

    Usage:

        result = CalculateTopologicalTorsionFingerprint(mol)

    Input: mol is a molecule object accepted by
    Torsions.GetTopologicalTorsionFingerprint.

    Output: result is a 3-tuple. The first item is the fingerprint length
    (``GetLength()``). The second is the dict of non-zero elements
    (``GetNonzeroElements()``). The third is the fingerprint object itself,
    which can be used for similarity calculations.
    """
    fingerprint = Torsions.GetTopologicalTorsionFingerprint(mol)
    length = fingerprint.GetLength()
    nonzero = fingerprint.GetNonzeroElements()
    return length, nonzero, fingerprint
def CalculateMorganFingerprint(mol,radius=2):
    """
    Calculate Morgan fingerprints.

    Usage:

        result = CalculateMorganFingerprint(mol)

    Input: mol is a molecule object accepted by
    AllChem.GetMorganFingerprint. radius is the radius argument forwarded
    to that call (default 2).

    Output: result is a 3-tuple. The first item is the fingerprint length
    (``GetLength()``). The second is the dict of non-zero elements
    (``GetNonzeroElements()``). The third is the fingerprint object itself,
    which can be used for similarity calculations.
    """
    fingerprint = AllChem.GetMorganFingerprint(mol, radius)
    length = fingerprint.GetLength()
    nonzero = fingerprint.GetNonzeroElements()
    return length, nonzero, fingerprint
def CalculateECFP2Fingerprint(mol, radius=1, nBits=1024):
    """
    Calculate ECFP2 fingerprints (Morgan fingerprint, default radius 1).

    Usage:

        result = CalculateECFP2Fingerprint(mol)

    Input: mol is a molecule object accepted by
    AllChem.GetMorganFingerprint. radius is the Morgan radius forwarded to
    both fingerprint calls. nBits is the length of the folded bit vector
    (default 1024, the value that used to be hard-coded); it mirrors the
    nBits parameter of the FCFP functions in this module.

    Output: result is a 3-tuple. The first item is a tuple built from the
    nBits-long Morgan bit vector. The second is the dict of non-zero
    elements of the unfolded Morgan fingerprint. The third is that unfolded
    fingerprint object, which can be used for similarity calculations.
    """
    sparse_fp = AllChem.GetMorganFingerprint(mol, radius)
    bit_vector = tuple(
        AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits))
    return bit_vector, sparse_fp.GetNonzeroElements(), sparse_fp
def CalculateECFP4Fingerprint(mol, radius=2, nBits=1024):
    """
    Calculate ECFP4 fingerprints (Morgan fingerprint, default radius 2).

    Usage:

        result = CalculateECFP4Fingerprint(mol)

    Input: mol is a molecule object accepted by
    AllChem.GetMorganFingerprint. radius is the Morgan radius forwarded to
    both fingerprint calls. nBits is the length of the folded bit vector
    (default 1024, the value that used to be hard-coded); it mirrors the
    nBits parameter of the FCFP functions in this module.

    Output: result is a 3-tuple. The first item is a tuple built from the
    nBits-long Morgan bit vector. The second is the dict of non-zero
    elements of the unfolded Morgan fingerprint. The third is that unfolded
    fingerprint object, which can be used for similarity calculations.
    """
    sparse_fp = AllChem.GetMorganFingerprint(mol, radius)
    bit_vector = tuple(
        AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits))
    return bit_vector, sparse_fp.GetNonzeroElements(), sparse_fp
def CalculateECFP6Fingerprint(mol, radius=3, nBits=1024):
    """
    Calculate ECFP6 fingerprints (Morgan fingerprint, default radius 3).

    Usage:

        result = CalculateECFP6Fingerprint(mol)

    Input: mol is a molecule object accepted by
    AllChem.GetMorganFingerprint. radius is the Morgan radius forwarded to
    both fingerprint calls. nBits is the length of the folded bit vector
    (default 1024, the value that used to be hard-coded); it mirrors the
    nBits parameter of the FCFP functions in this module.

    Output: result is a 3-tuple. The first item is a tuple built from the
    nBits-long Morgan bit vector. The second is the dict of non-zero
    elements of the unfolded Morgan fingerprint. The third is that unfolded
    fingerprint object, which can be used for similarity calculations.
    """
    sparse_fp = AllChem.GetMorganFingerprint(mol, radius)
    bit_vector = tuple(
        AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits))
    return bit_vector, sparse_fp.GetNonzeroElements(), sparse_fp
def CalculateSimilarityPybel(fp1,fp2):
    """
    Calculate the Tanimoto similarity between two fingerprint results.

    Usage:

        result = CalculateSimilarityPybel(fp1, fp2)

    Input: fp1 and fp2 are fingerprint result tuples whose second item is
    a dict keyed by on-bit position (the shape returned by the FP2/FP3/FP4
    functions in this module).

    Output: result is the number of shared keys divided by the number of
    distinct keys across both dicts, rounded to three decimals.

    Raises ZeroDivisionError when both dicts are empty.
    """
    keys_a = set(fp1[1].keys())
    keys_b = set(fp2[1].keys())
    shared_count = len(keys_a & keys_b)
    total_count = len(keys_a | keys_b)
    # float() keeps true division under Python 2 as well.
    return round(shared_count / float(total_count), 3)
def CalculateSimilarityRdkit(fp1,fp2,similarity="Tanimoto"):
    """
    Calculate the similarity between two fingerprints.

    Usage:

        result = CalculateSimilarityRdkit(fp1, fp2, similarity="Dice")

    Users can choose among the measures listed in
    DataStructs.similarityFunctions: Tanimoto, Dice, Cosine, Sokal, Russel,
    RogotGoldberg, AllBit, Kulczynski, McConnaughey, Asymmetric,
    BraunBlanquet.

    Input: fp1 and fp2 are two fingerprint objects accepted by the chosen
    similarity function. similarity is the name of the measure; the first
    listed measure whose name contains this string is used, and if none
    matches the first entry of DataStructs.similarityFunctions is used.

    Output: result is the similarity value rounded to three decimals.
    """
    measures = DataStructs.similarityFunctions
    # Start from the fallback, then stop at the first match. The previous
    # loop had no break and reset to the fallback on every non-matching
    # entry, so a requested measure was overwritten by later entries unless
    # it happened to be the last one in the list.
    similarityfunction = measures[0][1]
    for entry in measures:
        if similarity in entry[0]:
            similarityfunction = entry[1]
            break
    return round(similarityfunction(fp1, fp2), 3)
def CalculateFCFP2Fingerprint(mol, radius=1, nBits = 1024):
    """
    Calculate FCFP2 fingerprints (feature-based Morgan, default radius 1).

    Usage:

        result = CalculateFCFP2Fingerprint(mol)

    Input: mol is a molecule object accepted by
    AllChem.GetMorganFingerprint. radius is the Morgan radius. nBits is the
    length passed to GetMorganFingerprintAsBitVect (default 1024).

    Output: result is a 3-tuple. The first item is a tuple built from the
    feature-based Morgan bit vector. The second is the dict of non-zero
    elements of the unfolded feature-based fingerprint. The third is that
    unfolded fingerprint object, which can be used for similarity
    calculations.
    """
    feature_fp = AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
    folded = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius, nBits, useFeatures=True)
    bit_vector = tuple(folded)
    return bit_vector, feature_fp.GetNonzeroElements(), feature_fp
def CalculateFCFP4Fingerprint(mol,radius=2, nBits = 1024):
    """
    Calculate FCFP4 fingerprints (feature-based Morgan, default radius 2).

    Usage:

        result = CalculateFCFP4Fingerprint(mol)

    Input: mol is a molecule object accepted by
    AllChem.GetMorganFingerprint. radius is the Morgan radius. nBits is the
    length passed to GetMorganFingerprintAsBitVect (default 1024).

    Output: result is a 3-tuple. The first item is a tuple built from the
    feature-based Morgan bit vector. The second is the dict of non-zero
    elements of the unfolded feature-based fingerprint. The third is that
    unfolded fingerprint object, which can be used for similarity
    calculations.
    """
    feature_fp = AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
    folded = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius, nBits, useFeatures=True)
    bit_vector = tuple(folded)
    return bit_vector, feature_fp.GetNonzeroElements(), feature_fp
def CalculateFCFP6Fingerprint(mol,radius=3, nBits = 1024):
    """
    Calculate FCFP6 fingerprints (feature-based Morgan, default radius 3).

    Usage:

        result = CalculateFCFP6Fingerprint(mol)

    Input: mol is a molecule object accepted by
    AllChem.GetMorganFingerprint. radius is the Morgan radius. nBits is the
    length passed to GetMorganFingerprintAsBitVect (default 1024).

    Output: result is a 3-tuple. The first item is a tuple built from the
    feature-based Morgan bit vector. The second is the dict of non-zero
    elements of the unfolded feature-based fingerprint. The third is that
    unfolded fingerprint object, which can be used for similarity
    calculations.
    """
    feature_fp = AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
    folded = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius, nBits, useFeatures=True)
    bit_vector = tuple(folded)
    return bit_vector, feature_fp.GetNonzeroElements(), feature_fp
################################################################ fdefstr = ''' AtomType NDonor [N&!H0&v3,N&!H0&+1&v4,n&H1&+0] AtomType ChalcDonor [O,S;H1;+0] DefineFeature SingleAtomDonor [{NDonor},{ChalcDonor},!$([D1]-[C;D3]=[O,S,N])] Family Donor Weights 1 EndFeature AtomType NAcceptor [$([N&v3;H1,H2]-[!$(*=[O,N,P,S])])] Atomtype NAcceptor [$([N;v3;H0])] AtomType NAcceptor [$([n;+0])] AtomType ChalcAcceptor [$([O,S;H1;v2]-[!$(*=[O,N,P,S])])] AtomType ChalcAcceptor [O,S;H0;v2] Atomtype ChalcAcceptor [O,S;-] Atomtype ChalcAcceptor [o,s;+0] AtomType HalogenAcceptor [F] DefineFeature SingleAtomAcceptor [{NAcceptor},{ChalcAcceptor},{HalogenAcceptor}] Family Acceptor Weights 1 EndFeature # this one is delightfully easy: DefineFeature AcidicGroup [C,S](=[O,S,P])-[O;H1,H0&-1] Family NegIonizable Weights 1.0,1.0,1.0 EndFeature AtomType CarbonOrArom_NonCarbonyl [$([C,a]);!$([C,a](=O))] AtomType BasicNH2 [$([N;H2&+0][{CarbonOrArom_NonCarbonyl}])] AtomType BasicNH1 [$([N;H1&+0]([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])] AtomType BasicNH0 [$([N;H0&+0]([{CarbonOrArom_NonCarbonyl}])([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])] AtomType BasicNakedN [N,n;X2;+0] DefineFeature BasicGroup [{BasicNH2},{BasicNH1},{BasicNH0},{BasicNakedN}] Family PosIonizable Weights 1.0 EndFeature # aromatic rings of various sizes: DefineFeature Arom5 a1aaaa1 Family Aromatic Weights 1.0,1.0,1.0,1.0,1.0 EndFeature DefineFeature Arom6 a1aaaaa1 Family Aromatic Weights 1.0,1.0,1.0,1.0,1.0,1.0 EndFeature DefineFeature Arom7 a1aaaaaa1 Family Aromatic Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0 EndFeature DefineFeature Arom8 a1aaaaaaa1 Family Aromatic Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 EndFeature ''' featFactory = ChemicalFeatures.BuildFeatureFactoryFromString(fdefstr)
def CalculatePharm2D2pointFingerprint(mol, featFactory = featFactory):
    """
    Calculate Pharm2D 2-point pharmacophore fingerprints.

    Usage:

        result = CalculatePharm2D2pointFingerprint(mol)

    Input: mol is a molecule object accepted by Generate.Gen2DFingerprint.
    featFactory is the feature factory used to build the signature factory;
    it defaults to the module-level featFactory.

    Output: result is a 3-tuple. The first item is a tuple of 0/1 values,
    one per signature bit. Its length is taken from the signature factory
    instead of the previously hard-coded 135, so a custom featFactory gets
    a vector of matching size. The second is a tuple of the on-bit
    positions. The third is the fingerprint object returned by
    Generate.Gen2DFingerprint.
    """
    sig_factory = SigFactory(featFactory, minPointCount=2, maxPointCount=2)
    # Nine unit-width distance bins: (0, 1), (1, 2), ..., (8, 9).
    sig_factory.SetBins([(lower, lower + 1) for lower in range(9)])
    sig_factory.Init()

    res = Generate.Gen2DFingerprint(mol, sig_factory)
    res_keys = tuple(res.GetOnBits())

    # Size the dense vector from the factory so every on-bit index is valid.
    dense_bits = [0] * sig_factory.GetSigSize()
    for res_key in res_keys:
        dense_bits[res_key] = 1
    return tuple(dense_bits), res_keys, res
def CalculatePharm2D3pointFingerprint(mol, featFactory = featFactory):
    """
    Calculate Pharm2D 3-point pharmacophore fingerprints.

    Usage:

        result = CalculatePharm2D3pointFingerprint(mol)

    Input: mol is a molecule object accepted by Generate.Gen2DFingerprint.
    featFactory is the feature factory used to build the signature factory;
    it defaults to the module-level featFactory.

    Output: result is a 3-tuple. The first item is a tuple of 0/1 values,
    one per signature bit. Its length is taken from the signature factory
    instead of the previously hard-coded 2135, so a custom featFactory gets
    a vector of matching size. The second is a tuple of the on-bit
    positions. The third is the fingerprint object returned by
    Generate.Gen2DFingerprint.
    """
    sig_factory = SigFactory(featFactory, minPointCount=3, maxPointCount=3)
    sig_factory.SetBins([(0, 2), (2, 4), (4, 6), (6, 10)])
    sig_factory.Init()

    res = Generate.Gen2DFingerprint(mol, sig_factory)
    res_keys = tuple(res.GetOnBits())

    # Size the dense vector from the factory so every on-bit index is valid.
    dense_bits = [0] * sig_factory.GetSigSize()
    for res_key in res_keys:
        dense_bits[res_key] = 1
    return tuple(dense_bits), res_keys, res
def CalculateGhoseCrippenFingerprint(mol, count = False):
    """
    Calculate GhoseCrippen fingerprints.

    Usage:

        result = CalculateGhoseCrippenFingerprint(mol)

    Input: mol is a molecule object accepted by
    ghosecrippen.GhoseCrippenFingerprint. count is forwarded unchanged as
    that function's ``count`` keyword (default False).

    Output: whatever ghosecrippen.GhoseCrippenFingerprint returns.
    """
    return GhoseCrippenFingerprint(mol, count=count)
def CalculatePubChemFingerprint(mol):
    """
    Calculate PubChem fingerprints.

    Usage:

        result = CalculatePubChemFingerprint(mol)

    Input: mol is a molecule object accepted by
    PubChemFingerprints.calcPubChemFingerAll.

    Output: whatever PubChemFingerprints.calcPubChemFingerAll returns.
    """
    return calcPubChemFingerAll(mol)
# Lookup table from fingerprint name to the function that computes it.
_FingerprintFuncs = {
    'FP2': CalculateFP2Fingerprint,
    'FP3': CalculateFP3Fingerprint,
    'FP4': CalculateFP4Fingerprint,
    'topological': CalculateDaylightFingerprint,
    'Estate': CalculateEstateFingerprint,
    'atompairs': CalculateAtomPairsFingerprint,
    'torsions': CalculateTopologicalTorsionFingerprint,
    'morgan': CalculateMorganFingerprint,
    'ECFP2': CalculateECFP2Fingerprint,
    'ECFP4': CalculateECFP4Fingerprint,
    'ECFP6': CalculateECFP6Fingerprint,
    'MACCS': CalculateMACCSFingerprint,
    'FCFP2': CalculateFCFP2Fingerprint,
    'FCFP4': CalculateFCFP4Fingerprint,
    'FCFP6': CalculateFCFP6Fingerprint,
    'Pharm2D2point': CalculatePharm2D2pointFingerprint,
    'Pharm2D3point': CalculatePharm2D3pointFingerprint,
    'PubChem': CalculatePubChemFingerprint,
    'GhoseCrippen': CalculateGhoseCrippenFingerprint,
}
################################################################

if __name__=="__main__":
    # Small demo run. Each print takes one parenthesised expression, which
    # gives the same output under the Python 2 print statement.
    print('-'*10+'START'+'-'*10)

    ms = [
        Chem.MolFromSmiles('CCOC=N'),
        Chem.MolFromSmiles('NC1=NC(=CC=N1)N1C=CC2=C1C=C(O)C=C2'),
    ]
    m2 = [pybel.readstring("smi",'CCOC=N'), pybel.readstring("smi",'CCO')]

    res1 = CalculateECFP4Fingerprint(ms[0])
    print(res1)
    print('-'*25)

    res2 = CalculateECFP4Fingerprint(ms[1])
    print(res2)
    print('-'*25)

    mol = pybel.readstring("smi", 'CCOC=N')
    res3 = CalculateFP3Fingerprint(mol)
    print(res3)
    print('-'*25)

    mol = Chem.MolFromSmiles('O=C1NC(=O)NC(=O)C1(C(C)C)CC=C')
    res4 = CalculatePharm2D2pointFingerprint(mol)[0]
    print(res4)
    print('-'*25)

    res5 = CalculatePharm2D3pointFingerprint(mol)[0]
    print(res5)
    print('-'*25)

    res6 = CalculateGhoseCrippenFingerprint(mol)
    print(res6)
    print('-'*25)

    res7 = CalculatePubChemFingerprint(mol)
    print(res7)

    print('-'*10+'END'+'-'*10)