# Source code for ghosecrippen

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""

This module calculates the Ghose-Crippen descriptors. If you

have any questions, please contact the authors via email.

Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.06.04

Email: gadsby@163.com and oriental-cds@163.com

"""
import string
import os
from rdkit import Chem

Version = 1.0
###########################################################################
def _ReadPatts(fileName):
  """
  #################################################################
  *Internal Use Only*

  Parse the pattern list from a tab-separated data file.

  A row is used only if it does not start with '#', has at least
  four tab-separated fields and a non-empty first field. The row
  whose second field is the literal 'SMARTS' (the column header)
  is skipped silently.

  :param fileName: path of the pattern file to read.
  :return: tuple ``(order, patts)``. ``order`` lists the type names
           (first field, stripped) in order of first appearance;
           ``patts`` maps each type name to a list of
           ``(smarts, compiled_pattern)`` tuples in file order.

  A SMARTS that cannot be compiled is reported on stdout and
  omitted from the result.
  #################################################################
  """
  patts = {}
  order = []
  with open(fileName, 'r') as f:
    for line in f:
      # Comment lines carry no pattern definitions.
      if line.startswith('#'):
        continue
      splitLine = line.split('\t')
      if len(splitLine) < 4 or splitLine[0] == '':
        continue
      sma = splitLine[1]
      if sma == 'SMARTS':
        # Column-header row, not a pattern.
        continue
      # str.replace returns a new string; the result must be kept,
      # otherwise the quotes stay in the SMARTS.
      sma = sma.replace('"', '')
      p = Chem.MolFromSmarts(sma)
      if not p:
        # Report the SMARTS that failed to compile (previously this
        # message was emitted for the header row instead).
        print('Problems parsing smarts: %s'%(sma))
        continue
      # str.strip() replaces string.strip(), which only exists in
      # Python 2.
      cha = splitLine[0].strip()
      if cha not in patts:
        order.append(cha)
        patts[cha] = []
      patts[cha].append((sma, p))
  return order, patts


###########################################################################
def GhoseCrippenFingerprint(mol, count = False):
  """
  #################################################################
  Ghose-Crippen substructures or counts based on the definitions of
  SMARTS from Ghose-Crippen's paper. (110 dimension)

  The result is a dict format.

  :param mol: molecule object providing ``GetSubstructMatches``
              (presumably an RDKit Mol -- confirm with callers).
  :param count: if equal to False (the default) every value is 1
                when the pattern matched at least once and 0
                otherwise; for any other value the number of
                matches is returned.
  :return: dict keyed by the pattern type names read from the
           ``Crippen.txt`` file located next to this module.
  #################################################################
  """
  pattFile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          "Crippen.txt")
  # The ordering list returned by _ReadPatts is not needed here.
  _, patts = _ReadPatts(pattFile)

  GCres = {}
  for name, entries in patts.items():
    # NOTE(review): only the first SMARTS registered for each type is
    # matched; further SMARTS of the same type are ignored -- confirm
    # this is intended.
    # The two positional False flags are presumably uniquify and
    # useChirality -- confirm against the RDKit documentation.
    match = mol.GetSubstructMatches(entries[0][1], False, False)
    GCres[name] = len(match)

  # Equality test (not truthiness) kept on purpose so that only
  # False/0 select the binary fingerprint, exactly as before.
  if count == False:
    return dict((name, 1 if hits > 0 else 0)
                for name, hits in GCres.items())
  return GCres
###############################################################################
if __name__ =='__main__':
  # Small demonstration: print the count-based fingerprint of a few
  # example SMILES strings.
  smif = ['CCCC','CCCCC','CCCCCC','CC(N)C(=O)O','CC(N)C(=O)[O-].[Na+]']
  AllDes = []
  for i in smif:
    mol = Chem.MolFromSmiles(i)
    # GhoseCrippenFingerprint loads the pattern file itself, so no
    # separate _ReadPatts call is needed here.
    temp = GhoseCrippenFingerprint(mol, count = True)
    AllDes.append(temp)
  # Function-call form works on both Python 2 and Python 3.
  print(AllDes)