import sys, os, shutil, codecs
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..")
from Utils.ProgressCounter import ProgressCounter
from Tools.BLLIPParser import escDict
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
from collections import defaultdict

# Build the inverse of the BLLIP parser's escape dictionary
unEscDict = {}
for k, v in escDict.iteritems():
    unEscDict[v] = k

def getTokenText(tokenElement):
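    # Return the token's text with line breaks replaced by spaces and surrounding whitespace stripped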
    return tokenElement.get("text").replace("\n", " ").replace("\r", " ").strip()

def getTokens(tokenizationElement):
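    # Sort the tokens by character offset and merge tokens marked as split from the
    # same original token. Returns the merged token texts and a map from each
    # token's index to the index of its merged text.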
    tokenElements = []
    for tokenElement in tokenizationElement.findall("token"):
        charOffset = tokenElement.get("charOffset")
        begin, end = charOffset.split("-")
        tokenElements.append( [int(begin), int(end), tokenElement] )
    tokenElements.sort()

    index = 0
    tokenTexts = []
    tokenIdMap = {} # maps a token's index in offset order to its index in tokenTexts
    splitFrom = None
    for tokenElement in tokenElements:
        token = tokenElement[2]
        if token.get("splitFrom") != None:
            if splitFrom != token.get("splitFrom"): # this token starts a new group of split tokens
                splitFrom = token.get("splitFrom")
                tokenTexts.append(getTokenText(token))
            else: # this token continues the current group of split tokens
                tokenTexts[-1] = tokenTexts[-1] + getTokenText(token)
        else: # this token is not part of a split group
            splitFrom = None
            tokenTexts.append(getTokenText(token))
        tokenIdMap[index] = len(tokenTexts) - 1
        index += 1
    return tokenTexts, tokenIdMap

def exportTokenization(tokenizationElement, parseElement, sentenceElement, outFile):
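    # Write the sentence's tokens as one space-separated line, falling back to the
    # whitespace-normalized sentence text if there is no tokenization or parse string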
    pennstring = None
    if parseElement != None:
        pennstring = parseElement.get("pennstring")
    if tokenizationElement != None and pennstring != None and pennstring.strip() != "":
        tokenTexts, tokenIdMap = getTokens(tokenizationElement)
        outFile.write(" ".join(tokenTexts) + "\n")
    else:
        outFile.write(" ".join(sentenceElement.get("text").strip().split()) + "\n")
    return True

def exportPennTreeBank(parseElement, outFile):
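    # Write the Penn Treebank parse string as one line (an empty line if the
    # sentence has no parse string)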
    pennstring = None
    if parseElement != None:
        pennstring = parseElement.get("pennstring")
    if pennstring != None and pennstring.strip() != "":
        outFile.write(pennstring.strip())
    outFile.write("\n")
    if pennstring == None:
        return False
    else:
        return True

def exportStanfordDependencies(parseElement, tokenizationElement, outFile, tokenIdOffset=0):
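    # Write one "type(token-N, token-M)" line per dependency followed by an empty
    # line; the token indices in the output are one-based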
    global unEscDict
    escDictKeys = sorted(unEscDict.keys())

    tokens = []
    # Get the token texts and apply the inverse escape mapping to them
    if tokenizationElement != None:
        tokens, tokenIdMap = getTokens(tokenizationElement)
        for i in range(len(tokens)):
            for key in escDictKeys:
                tokens[i] = tokens[i].replace(key, unEscDict[key])

    if parseElement != None:
        for dependency in parseElement.findall("dependency"):
            if dependency.get("split") != None: # skip dependencies added by token splitting
                continue
            # The token index is the last underscore-separated part of the token id
            t1Index = tokenIdMap[int(dependency.get("t1").split("_")[-1]) + tokenIdOffset]
            t2Index = tokenIdMap[int(dependency.get("t2").split("_")[-1]) + tokenIdOffset]
            assert t1Index < len(tokens), (t1Index, tokens, tokenIdMap, dependency.attrib)
            assert t2Index < len(tokens), (t2Index, tokens, tokenIdMap, dependency.attrib)
            t1 = tokens[t1Index] + "-" + str(t1Index + 1)
            t2 = tokens[t2Index] + "-" + str(t2Index + 1)
            outFile.write(dependency.get("type") + "(" + t1 + ", " + t2 + ")\n")
    outFile.write("\n")
    if parseElement != None:
        return True
    else:
        return False

def export(input, output, parse, tokenization=None, toExport=["tok", "ptb", "sd"], inputSuffixes=None, clear=False, tokenIdOffset=0):
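    # Export the chosen parse of each document in the corpus as per-document
    # .tok, .ptb and .sd files in the output directory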
    print >> sys.stderr, "##### Export Parse #####"

    if os.path.exists(output) and clear:
        shutil.rmtree(output)
    if not os.path.exists(output):
        os.makedirs(output)
    if inputSuffixes != None:
        inputFileNames = []
        for suffix in inputSuffixes:
            inputFileNames.append(input + suffix)
    else:
        inputFileNames = [input]

    for inputFileName in inputFileNames:
        print >> sys.stderr, "Processing input file", inputFileName
        corpusRoot = ETUtils.ETFromObj(inputFileName).getroot()
        documents = corpusRoot.findall("document")
        counter = ProgressCounter(len(documents), "Documents")
        counts = defaultdict(int)
        for document in documents:
            counter.update()
            # Name the output files by PubMed id, original document id or corpus
            # document id, in that order of preference
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId == None:
                docId = document.get("id")
            counts["document"] += 1

            # Open one output file per exported format for this document
            outfiles = {}
            for fileExt in toExport:
                outfilePath = output + "/" + docId + "." + fileExt
                assert not os.path.exists(outfilePath) # fail rather than overwrite an existing file
                outfiles[fileExt] = codecs.open(outfilePath, "wt", "utf-8")

            for sentence in document.findall("sentence"):
                counts["sentence"] += 1
                # Find the requested parse and the matching tokenization for this sentence
                parseElement = None
                for e in sentence.getiterator("parse"):
                    if e.get("parser") == parse:
                        parseElement = e
                        counts["parse"] += 1
                        break
                if tokenization == None and parseElement != None:
                    tokenization = parseElement.get("tokenizer")
                tokenizationElement = None
                for e in sentence.getiterator("tokenization"):
                    if e.get("tokenizer") == tokenization:
                        tokenizationElement = e
                        counts["tokenization"] += 1
                        break
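                # Write one line per sentence to each requested output file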
                if "tok" in outfiles:
                    if exportTokenization(tokenizationElement, parseElement, sentence, outfiles["tok"]):
                        counts["tok"] += 1
                if "ptb" in outfiles:
                    if exportPennTreeBank(parseElement, outfiles["ptb"]):
                        counts["ptb"] += 1
                if "sd" in outfiles:
                    if exportStanfordDependencies(parseElement, tokenizationElement, outfiles["sd"], tokenIdOffset):
                        counts["sd"] += 1

            for fileExt in outfiles:
                outfiles[fileExt].close()
                outfiles[fileExt] = None

    print >> sys.stderr, "Parse export counts:"
    for k in sorted(counts.keys()):
        print >> sys.stderr, "  " + str(k) + ":", counts[k]

if __name__=="__main__":
    from optparse import OptionParser

    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
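
    # Example invocation (file and parse names below are hypothetical):
    #   python ExportParse.py -i corpus.xml -o parses -p McCC --clear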

    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-s", "--inputSuffixes", default=None, dest="inputSuffixes", help="e.g. '-train.xml,-devel.xml,-test.xml'", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory.")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="Name of the parse to export")
    optparser.add_option("-c", "--clear", default=False, action="store_true", dest="clear", help="Remove the output directory first if it already exists")
    optparser.add_option("--tokenIdOffset", default=0, type="int", dest="tokenIdOffset", help="Offset added to the token indices of dependencies")
    (options, args) = optparser.parse_args()

    if options.inputSuffixes != None:
        options.inputSuffixes = options.inputSuffixes.split(",")
    export(options.input, options.output, options.parse, clear=options.clear, inputSuffixes=options.inputSuffixes, tokenIdOffset=options.tokenIdOffset)