Monday, January 08, 2007

A simple CML parser written in Python

Python is a powerful programming language. It can be easily extended by importing modules.
Below is a simple module for parsing CML files. As you can see, it's really easy to add support for other tags.



import xml.sax.handler

class CMLHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects a few fields from a CML document.

    Pass an instance to a SAX parser (for example ``xml.sax.parse``); once
    parsing finishes, the collected values are available as attributes:

    * ``id``      -- ``id`` attribute of the last ``<molecule>`` seen.
    * ``formula`` -- ``concise`` attribute of the last ``<formula>`` seen.
    * ``name``    -- concatenated text of every ``<name>`` element.
    * ``inchi``   -- ``"InChI/1="`` followed by the text of ``<basic>``
      elements nested in an ``<identifier version="InChI/1">``.
    * ``weight``  -- text of ``<scalar dictRef="cml:molwt">``.
    * ``mpt``     -- text of ``<scalar dictRef="cml:mpt">``
      (presumably the melting point -- confirm against the CML dictionary).
    * ``bpt``     -- text of ``<scalar dictRef="cml:bpt">``
      (presumably the boiling point -- confirm against the CML dictionary).
    * ``mptSet`` / ``bptSet`` -- True once the matching scalar was closed.

    All values are kept as strings; text is appended, never replaced, so
    repeated elements accumulate. The ``in*`` attributes are parser state
    flags that record which element the parser is currently inside.
    """

    def __init__(self):
        # Explicit base-class call (instead of super()) keeps this working
        # on interpreters where ContentHandler is an old-style class.
        xml.sax.handler.ContentHandler.__init__(self)
        self.id = ""
        self.formula = ""
        self.inIdentifier = False
        self.inchi = "InChI/1="
        self.inInChI = False
        self.inBasic = False
        self.name = ""
        self.inName = False
        self.weight = ""
        self.inWeight = False
        self.mpt = ""
        self.mptSet = False
        self.inMpt = False
        self.bpt = ""
        self.inBpt = False
        self.bptSet = False

    def startElement(self, name, attributes):
        """Record attribute values and raise the state flag for ``name``.

        Missing optional attributes are tolerated: the corresponding field
        is simply left unchanged instead of raising ``KeyError``.
        """
        if name == "molecule":
            self.id = attributes.get("id", self.id)
        elif name == "formula":
            self.formula = attributes.get("concise", self.formula)
        elif name == "identifier":
            self.inIdentifier = True
            # .get() replaces the Python-2-only has_key() lookup.
            if attributes.get("version") == "InChI/1":
                self.inInChI = True
        elif name == "basic":
            self.inBasic = True
        elif name == "name":
            self.inName = True
        elif name == "scalar":
            # A <scalar> without dictRef (or with an unknown one) is ignored.
            dict_ref = attributes.get("dictRef")
            if dict_ref == "cml:molwt":
                self.inWeight = True
            elif dict_ref == "cml:mpt":
                self.inMpt = True
            elif dict_ref == "cml:bpt":
                self.inBpt = True

    def characters(self, data):
        """Append ``data`` to every field whose element is currently open.

        The parser may deliver one text node in several chunks, hence the
        use of ``+=`` rather than plain assignment.
        """
        if self.inName:
            self.name += data

        if self.inWeight:
            self.weight += data

        if self.inMpt:
            self.mpt += data

        if self.inBpt:
            self.bpt += data

        if self.inBasic and self.inInChI:
            self.inchi += data

    def endElement(self, name):
        """Lower the state flag for ``name`` and mark completed scalars."""
        if name == "identifier":
            self.inIdentifier = False
            # Also drop the InChI flag so an identifier that had no <basic>
            # child cannot leak its state into later elements.
            self.inInChI = False
        elif name == "basic":
            self.inBasic = False
            self.inInChI = False
        elif name == "name":
            self.inName = False
        elif name == "inchi":
            self.inInChI = False
        elif name == "scalar":
            if self.inWeight:
                self.inWeight = False
            elif self.inMpt:
                self.inMpt = False
                self.mptSet = True
            elif self.inBpt:
                self.inBpt = False
                self.bptSet = True