1 import sys, os
2 import itertools
3 import types
4 thisPath = os.path.dirname(os.path.abspath(__file__))
5 sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
6 import Utils.STFormat.STTools
7 import Utils.STFormat.ConvertXML
8 import Utils.STFormat.Equiv
9 import Utils.ElementTreeUtils as ETUtils
10 import Tools.GeniaSentenceSplitter
11 import Tools.BLLIPParser
12 import Tools.StanfordParser
13 import Tools.BANNER
14 from ToolChain import ToolChain
15 import Utils.InteractionXML.DivideSets
16 import Utils.ProteinNameSplitter as ProteinNameSplitter
17 import Utils.FindHeads as FindHeads
18
19 import Utils.Stream as Stream
20
25
27 steps = []
28 steps.append( ("CONVERT", self.convert, {"dataSetNames":None, "corpusName":None}, "documents.xml") )
29 steps.append( ("SPLIT-SENTENCES", Tools.GeniaSentenceSplitter.makeSentences, {"debug":False, "postProcess":True}, "sentences.xml") )
30 steps.append( ("NER", Tools.BANNER.run, {"elementName":"entity", "processElement":"sentence", "debug":False, "splitNewlines":True}, "ner.xml") )
31 steps.append( ("PARSE", Tools.BLLIPParser.parse, {"parseName":"McCC", "requireEntities":False, "debug":False}, "parse.xml") )
32 steps.append( ("CONVERT-PARSE", Tools.StanfordParser.convertXML, {"parser":"McCC", "debug":False}, "converted-parse.xml") )
33 steps.append( ("SPLIT-NAMES", ProteinNameSplitter.mainFunc, {"parseName":"McCC", "removeOld":True}, "split-names.xml") )
34 steps.append( ("FIND-HEADS", FindHeads.findHeads, {"parse":"McCC", "removeExisting":True}, "heads.xml") )
35 steps.append( ("DIVIDE-SETS", self.divideSets, {"outputStem":None, "saveCombined":True}) )
36 return steps
37
38 - def process(self, source, output, parameters=None, model=None, sourceDataSetNames=None, fromStep=None, toStep=None, omitSteps=None):
39
40
41
42
43
44
45
46
47
48
49 xml = ToolChain.process(self, source, output, parameters, model, fromStep, toStep, omitSteps)
50
51
52
53 return xml
54
55 - def convert(self, input, dataSetNames=None, corpusName=None, output=None):
56 if os.path.isdir(input) or input.endswith(".tar.gz") or input.endswith(".txt") or "," in input:
57 print >> sys.stderr, "Converting ST-format to Interaction XML"
58
59 dataSetDirs = input
60 documents = []
61 if type(dataSetDirs) in types.StringTypes:
62 dataSetDirs = dataSetDirs.split(",")
63
64 if dataSetNames == None:
65 dataSetNames = []
66 elif type(dataSetNames) in types.StringTypes:
67 dataSetNames = dataSetNames.split(",")
68
69 for dataSetDir, dataSetName in itertools.izip_longest(dataSetDirs, dataSetNames, fillvalue=None):
70 print >> sys.stderr, "Reading", dataSetDir, "set,",
71 docs = Utils.STFormat.STTools.loadSet(dataSetDir, dataSetName)
72 print >> sys.stderr, len(docs), "documents"
73 documents.extend(docs)
74 print >> sys.stderr, "Resolving equivalences"
75 Utils.STFormat.Equiv.process(documents)
76 if corpusName == None:
77 corpusName = "TEES"
78 self.xml = Utils.STFormat.ConvertXML.toInteractionXML(documents, corpusName, output)
79 else:
80 print >> sys.stderr, "Processing source as interaction XML"
81 self.xml = ETUtils.ETFromObj(input)
82 return self.xml
83
84 - def divideSets(self, input, outputStem, saveCombined=True):
85 if outputStem != None:
86 print >> sys.stderr, "Dividing into sets"
87 outDir, outputStem = os.path.split(outputStem)
88 Utils.InteractionXML.DivideSets.processCorpus(input, outDir, outputStem, ".xml", saveCombined=saveCombined)
89 else:
90 print >> sys.stderr, "No set division"
91
if __name__=="__main__":
    # Optional Psyco JIT speedup; purely best-effort on old Python 2 setups.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser(description="A tool chain for making interaction XML, sentence splitting, NER and parsing")
    optparser.add_option("-i", "--input", default=None, dest="input", help="")
    optparser.add_option("-n", "--inputNames", default=None, dest="inputNames", help="")
    optparser.add_option("-c", "--corpus", default=None, dest="corpus", help="corpus name")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output directory")
    optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="preprocessing parameters")
    optparser.add_option("-s", "--step", default=None, dest="step", help="")
    optparser.add_option("-t", "--toStep", default=None, dest="toStep", help="")
    optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="")
    optparser.add_option("--noLog", default=False, action="store_true", dest="noLog", help="")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--requireEntities", default=False, action="store_true", dest="requireEntities", help="")
    (options, args) = optparser.parse_args()
    # Allow a comma-separated list of step names to omit.
    if options.omitSteps != None:
        options.omitSteps = options.omitSteps.split(",")

    # Open the log next to the output stem. Guard against a missing -o,
    # which previously crashed with a TypeError; the redundant
    # single-argument os.path.join wrapper has also been removed.
    if not options.noLog and options.output != None:
        Stream.openLog(options.output + "-log.txt")

    preprocessor = Preprocessor()
    # Propagate the command-line flags into the relevant pipeline steps.
    preprocessor.setArgForAllSteps("debug", options.debug)
    preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus
    preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities
    preprocessor.process(options.input, options.output, options.parameters, None, options.inputNames, fromStep=options.step, toStep=options.toStep, omitSteps=options.omitSteps)
125