1 import sys, os, shutil, codecs
2 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..")
3 from Utils.ProgressCounter import ProgressCounter
4 from Tools.BLLIPParser import escDict
5 try:
6 import xml.etree.cElementTree as ET
7 except ImportError:
8 import cElementTree as ET
9 import Utils.ElementTreeUtils as ETUtils
10 from collections import defaultdict
11
# unEscDict inverts BLLIPParser's escDict: maps each escaped form back to
# the original text (later duplicates in escDict win, as in a plain loop).
unEscDict = dict((escaped, original) for original, escaped in escDict.iteritems())
15
def getTokenText(tokenElement):
    # Normalize a token's "text" attribute: line breaks become spaces and
    # surrounding whitespace is trimmed.
    cleaned = tokenElement.get("text")
    for lineBreak in ("\n", "\r"):
        cleaned = cleaned.replace(lineBreak, " ")
    return cleaned.strip()
19
21
def getTokens(tokenizationElement):
    """Collect token texts in character-offset order, merging split tokens.

    Tokens that share the same "splitFrom" attribute are concatenated back
    into a single text. Returns (tokenTexts, tokenIdMap) where tokenIdMap
    maps the index of each <token> element (in offset order) to the index
    of the merged text it belongs to in tokenTexts.
    """
    # Order tokens by their character offsets
    tokenElements = []
    for tokenElement in tokenizationElement.findall("token"):
        begin, end = tokenElement.get("charOffset").split("-")
        tokenElements.append([int(begin), int(end), tokenElement])
    tokenElements.sort()

    tokenTexts = []
    tokenIdMap = {}
    splitFrom = None
    for index, tokenElement in enumerate(tokenElements):
        token = tokenElement[2]
        source = token.get("splitFrom")
        if source is not None:
            if splitFrom != source:
                # First token of a new split group starts a new text
                splitFrom = source
                tokenTexts.append(getTokenText(token))
            else:
                # Continuation of the current split group: merge
                tokenTexts[-1] = tokenTexts[-1] + getTokenText(token)
        else:
            splitFrom = None
            tokenTexts.append(getTokenText(token))
        tokenIdMap[index] = len(tokenTexts) - 1
    return tokenTexts, tokenIdMap
48
def exportTokenization(tokenizationElement, parseElement, sentenceElement, outFile):
    """Write one line of space-separated tokens for a sentence to outFile.

    The tokenization is used only when the parse has a non-empty
    "pennstring"; otherwise the sentence text is split on whitespace as a
    fallback, so exactly one line is written either way. Always returns True.
    """
    pennstring = None
    if parseElement is not None:
        pennstring = parseElement.get("pennstring")
    if tokenizationElement is not None and pennstring is not None and pennstring.strip() != "":
        tokenTexts, tokenIdMap = getTokens(tokenizationElement)
        outFile.write(" ".join(tokenTexts) + "\n")
    else:
        outFile.write(" ".join(sentenceElement.get("text").strip().split()) + "\n")
    return True
60
def exportPennTreeBank(parseElement, outFile):
    """Write the parse's Penn tree string as one line to outFile.

    Nothing is written when the "pennstring" attribute is missing or blank.
    Returns True when the attribute was present (even if blank), else False.
    """
    pennstring = None
    if parseElement is not None:
        pennstring = parseElement.get("pennstring")
    if pennstring is not None and pennstring.strip() != "":
        outFile.write(pennstring.strip())
        outFile.write("\n")
    return pennstring is not None
72
def exportStanfordDependencies(parseElement, tokenizationElement, outFile, tokenIdOffset=0):
    """Write Stanford-dependency lines "type(tok-i, tok-j)" for a sentence.

    Token texts are un-escaped via unEscDict before use. A trailing blank
    line is always written (keeps per-sentence alignment in the output
    file). Returns True when a parse element was available, else False.
    """
    tokens = []
    if tokenizationElement is not None:
        tokens, tokenIdMap = getTokens(tokenizationElement)
        # Undo BLLIP-style escaping in every token text. Keys are sorted so
        # the replacement order is deterministic.
        escDictKeys = sorted(unEscDict.keys())
        for i in range(len(tokens)):
            for key in escDictKeys:
                tokens[i] = tokens[i].replace(key, unEscDict[key])

    if parseElement is not None:
        for dependency in parseElement.findall("dependency"):
            if dependency.get("split") is not None: # skip split-token dependencies
                continue
            # Token ids look like "..._N"; the numeric tail indexes tokenIdMap
            t1Index = tokenIdMap[int(dependency.get("t1").split("_")[-1]) + tokenIdOffset]
            t2Index = tokenIdMap[int(dependency.get("t2").split("_")[-1]) + tokenIdOffset]
            assert t1Index < len(tokens), (t1Index, tokens, tokenIdMap, dependency.attrib)
            assert t2Index < len(tokens), (t2Index, tokens, tokenIdMap, dependency.attrib)
            t1 = tokens[t1Index] + "-" + str(t1Index + 1)
            t2 = tokens[t2Index] + "-" + str(t2Index + 1)
            outFile.write(dependency.get("type") + "(" + t1 + ", " + t2 + ")\n")
    outFile.write("\n")
    if parseElement is not None:
        return True
    else:
        return False
102
def export(input, output, parse, tokenization=None, toExport=["tok", "ptb", "sd"], inputSuffixes=None, clear=False, tokenIdOffset=0):
    """Export parses from interaction-XML corpora into per-document files.

    input: corpus file path (or common prefix when inputSuffixes is given)
    output: output directory, created if missing
    parse: name of the parse (its "parser" attribute) to export
    tokenization: tokenization name; if None it is taken from the first
        parse element seen and reused for the rest of the run
    toExport: which per-document files to write ("tok", "ptb", "sd")
    inputSuffixes: optional list of suffixes appended to 'input' to form
        multiple input file names
    clear: if True, remove an existing output directory first
    tokenIdOffset: offset added to dependency token indices
    """
    print >> sys.stderr, "##### Export Parse #####"

    if os.path.exists(output) and clear:
        shutil.rmtree(output)
    if not os.path.exists(output):
        os.makedirs(output)
    if inputSuffixes is not None:
        inputFileNames = [input + suffix for suffix in inputSuffixes]
    else:
        inputFileNames = [input]

    for inputFileName in inputFileNames:
        print >> sys.stderr, "Processing input file", inputFileName
        corpusRoot = ETUtils.ETFromObj(inputFileName).getroot()
        documents = corpusRoot.findall("document")
        counter = ProgressCounter(len(documents), "Documents")
        counts = defaultdict(int)
        for document in documents:
            counter.update()
            # Prefer pmid, then origId, then the internal id for file naming
            docId = document.get("pmid")
            if docId is None:
                docId = document.get("origId")
            if docId is None:
                docId = document.get("id")
            counts["document"] += 1

            # One output file per requested extension; refuse to overwrite
            outfiles = {}
            for fileExt in toExport:
                outfilePath = output + "/" + docId + "." + fileExt
                assert not os.path.exists(outfilePath), outfilePath
                outfiles[fileExt] = codecs.open(outfilePath, "wt", "utf-8")

            for sentence in document.findall("sentence"):
                counts["sentence"] += 1
                parseElement = None
                for e in sentence.getiterator("parse"):
                    if e.get("parser") == parse:
                        parseElement = e
                        counts["parse"] += 1
                        break
                # Infer the tokenization from the parse, but only if a parse
                # was actually found (previously this crashed with an
                # AttributeError when parseElement was None)
                if tokenization is None and parseElement is not None:
                    tokenization = parseElement.get("tokenizer")
                tokenizationElement = None
                for e in sentence.getiterator("tokenization"):
                    if e.get("tokenizer") == tokenization:
                        tokenizationElement = e
                        counts["tokenization"] += 1
                        break
                if "tok" in outfiles:
                    if exportTokenization(tokenizationElement, parseElement, sentence, outfiles["tok"]):
                        counts["tok"] += 1
                if "ptb" in outfiles:
                    if exportPennTreeBank(parseElement, outfiles["ptb"]):
                        counts["ptb"] += 1
                if "sd" in outfiles:
                    if exportStanfordDependencies(parseElement, tokenizationElement, outfiles["sd"], tokenIdOffset):
                        counts["sd"] += 1

            for fileExt in outfiles:
                outfiles[fileExt].close()
                outfiles[fileExt] = None

        print >> sys.stderr, "Parse export counts:"
        for k in sorted(counts.keys()):
            print >> sys.stderr, " " + str(k) + ":", counts[k]
171
if __name__=="__main__":
    from optparse import OptionParser

    # psyco is an optional Python 2 JIT; its absence is not an error
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-s", "--inputSuffixes", default=None, dest="inputSuffixes", help="e.g. '-train.xml,-devel.xml,-test.xml'", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory.")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="")
    optparser.add_option("-c", "--clear", default=False, action="store_true", dest="clear", help="")
    optparser.add_option("--tokenIdOffset", default=0, type="int", dest="tokenIdOffset", help="")
    (options, args) = optparser.parse_args()

    if options.inputSuffixes is not None:
        options.inputSuffixes = options.inputSuffixes.split(",")
    # BUG FIX: was a bare 'tokenIdOffset' (NameError at runtime); the value
    # comes from the parsed command-line options
    export(options.input, options.output, options.parse, clear=options.clear, inputSuffixes=options.inputSuffixes, tokenIdOffset=options.tokenIdOffset)
194