1  import sys, os 
  2  import codecs 
  3  import Validate 
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 22          self.id = None 
 23          self.text = None 
 24          self.proteins = [] 
 25          self.triggers = [] 
 26          self.events = [] 
 27          self.relations = [] 
 28          self.dataSet = None 
 29          self.license = None 
   30   
 32 -    def __init__(self, id = None, type = None, text=None, trigger=None, arguments=None): 
  33          self.id = id  
 34          self.type = type  
 35          self.text = text  
 36          self.charBegin = -1  
 37          self.charEnd = -1  
 38          self.alternativeOffsets = [] 
 39          self.equiv = []  
 40          self.trigger = trigger  
 41          self.arguments = []  
 42          if arguments != None: 
 43              self.arguments = arguments 
 44          self.sites = [] 
 45          self.speculation = None  
 46          self.negation = None  
 47          self.fileType = None  
 48           
 49          self.triggerScores = None 
 50          self.unmergingScores = None 
 51          self.speculationScores = None 
 52          self.negationScores = None 
  53       
 55          return self.negation != None 
  56       
 58          return self.speculation != None 
  59       
 61          return self.type == "Protein" or self.type == "Gene" 
  62   
 63       
 65          if self.id == None: 
 66              return "<Ann NO-ID>" 
 67          else: 
 68              return "<Ann " + self.id + ">" 
   69   
 70 -def getStatistics(documents, printStats=True, statSeparator="\n"): 
  71      from collections import defaultdict 
 72      import types 
 73      if type(documents) in types.StringTypes: 
 74          documents = loadSet(documents) 
 75       
 76      stats = defaultdict(int) 
 77      for document in documents: 
 78          stats["total-docs"] += 1 
 79          stats["total-events"] += len(document.events) 
 80          stats["total-relations"] += len(document.relations) 
 81          stats["total-proteins"] += len(document.proteins) 
 82          stats["doc-events-"+str(len(document.events))] += 1 
 83          stats["doc-relations-"+str(len(document.relations))] += 1 
 84          stats["doc-proteins-"+str(len(document.proteins))] += 1 
 85          for event in document.events: 
 86              stats["events-"+event.type] += 1 
 87              if event.speculation != None: 
 88                  stats["events-"+event.type+"-spec"] += 1 
 89              if event.negation != None: 
 90                  stats["events-"+event.type+"-neg"] += 1 
 91              argStats = defaultdict(int) 
 92              nesting = False 
 93              for arg in event.arguments: 
 94                  argType = arg[0] 
 95                  if arg[2] != None: 
 96                      argType += "(" + arg[2].type + ")" 
 97                  if not arg[1].isName(): 
 98                      nesting = True 
 99                  argStats[argType] += 1 
100              if nesting: 
101                  stats["events-"+event.type+"-parent"] += 1 
102              stats["args-"+event.type+"-"+"-".join([str(key)+"_"+str(argStats[key]) for key in sorted(argStats.keys())]) ] += 1 
103      if printStats: 
104          print >> sys.stderr, "Event Statistics:" 
105          print >> sys.stderr, statSeparator.join([str(key)+":"+str(stats[key]) for key in sorted(stats.keys())]) 
106      return stats 
 107   
109       
110      assert string[0] == "T" or string[0] == "W", string 
111      string = string.strip() 
112      ann = Annotation() 
113      splits = string.split("\t") 
114      ann.id = splits[0] 
115      middle = splits[1] 
116      ann.text = splits[2] 
117      if readScores: 
118          ann.triggerScores = splits[3] 
119       
120      ann.type, ann.charBegin, ann.charEnd = middle.split() 
121      ann.charBegin = int(ann.charBegin) 
122      ann.charEnd = int(ann.charEnd) 
123       
124   
125   
126   
127   
128   
129   
130   
131   
132   
133   
134      return ann 
 135   
137      assert string[0] == "*", string 
138      string = string.strip() 
139      star, rest = string.split("\t") 
140      equivs = [] 
141      if rest.find("Equiv") == 0: 
142          splits = rest.split(" ") 
143          type = splits[0] 
144          assert type == "Equiv" 
145          entities = splits[1:]  
146          equivs.append( entities ) 
147      if len(equivs) > 0: 
148          protMap = {} 
149          for protein in proteins: 
150              protMap[protein.id] = protein 
151          for equiv in equivs: 
152              for member in equiv: 
153                  for other in equiv: 
154                      if member == other: 
155                          continue 
156                      if not protMap[other] in protMap[member].equiv: 
157                          protMap[member].equiv.append(protMap[other]) 
 158   
def readEvent(string, sitesAreArguments=False, readScores=False):
    """Parse one event line into an Annotation.

    The line must contain exactly one TAB: the event id before it and, after
    it, whitespace separated fields. The first field is "type",
    "type:triggerId" or (with scores) "type:triggerId:scores"; a second
    component containing "=" is treated as scores rather than a trigger id.
    The remaining fields are arguments of the form "name:targetId[:scores]".

    Each stored argument is a list [name, targetId, None, <scores...>, None];
    index 2 is the slot later filled with a site id. Unless sitesAreArguments
    is true or the event type is "SiteOf", fields whose name contains "Site"
    are not stored as arguments of their own but attached to the matching
    Theme/Cause argument. Score fields are only kept when readScores is true.

    Ids are left as strings here. Returns the new Annotation.
    """
    string = string.strip()
    ann = Annotation()
    ann.id, rest = string.split("\t")
    args = rest.split()
    trigger = args[0]
    args = args[1:]
    # "type[:triggerId][:scores]"
    splits = trigger.split(":")
    ann.type = splits[0]
    ann.trigger = None
    if len(splits) > 1:
        # A component with "=" is a score string, not a trigger id.
        if "=" not in splits[1]:
            ann.trigger = splits[1]
        elif readScores:
            ann.unmergingScores = splits[1]
        if len(splits) > 2 and readScores:
            assert "=" in splits[2]
            ann.unmergingScores = splits[2]

    # Original argument name (e.g. "Theme2") -> the list stored in ann.arguments.
    # The same list object is shared, so later edits through argMap are visible
    # in ann.arguments.
    argMap = {}

    for arg in args:
        argTuple = arg.split(":")
        argScores = []
        if len(argTuple) > 2:
            if readScores:
                argScores = argTuple[2:]
            argTuple = argTuple[:2]
        # Layout: [name, targetId, site slot, <argument scores...>, trailing None]
        argTuple += [None] + argScores + [None]

        # Site fields are skipped here (and handled in the second pass below)
        # unless they are to be kept as ordinary arguments.
        if sitesAreArguments or argTuple[0].find("Site") == -1 or ann.type == "SiteOf":
            origArgName = argTuple[0]
            if argTuple[0].find("Theme") != -1:
                # Numbered themes ("Theme2") are stored under the plain name.
                argTuple[0] = "Theme"
            assert origArgName != ""
            argMap[origArgName] = argTuple
            ann.arguments.append( argTuple )
            if "Site" in argTuple[0]:
                # A site kept as an argument is renamed to "SiteArg".
                assert argTuple[0] == "Site"
                argTuple[0] = "SiteArg"

    # Some fields were not stored above (or names repeated): attach the sites
    # to the arguments they belong to.
    if len(argMap.keys()) != len(args):
        for arg in args:
            argTuple = arg.split(":")
            if "Site" in argTuple[0]:
                if argTuple[0] == "CSite":
                    target = "Cause"
                else:
                    # "Site2" -> "Theme2", "Site" -> "Theme"
                    target = "Theme" + argTuple[0][4:]
                if target not in argMap:
                    # Fall back to the unnumbered theme.
                    assert "Theme" in target
                    assert "Theme" in argMap
                    target = "Theme"
                argMap[target][2] = argTuple[1]
                if readScores and len(argTuple) > 2:
                    # NOTE(review): index 4 only exists when the target argument
                    # itself carried at least one score field -- confirm inputs.
                    argMap[target][4] = argTuple[2]
    return ann
 223   
225      string = string.strip() 
226      ann = Annotation() 
227      tabSplits = string.split("\t") 
228      ann.id = tabSplits[0] 
229      args = tabSplits[1].split() 
230      ann.type = args[0] 
231      args = args[1:] 
232      argMap = {} 
233       
234      for arg in args: 
235          argTuple = arg.split(":") 
236           
237          if readScores and len(argTuple) > 2: 
238              ann.arguments.append( [argTuple[0], argTuple[1], None, argTuple[2], None] ) 
239          else: 
240              ann.arguments.append( [argTuple[0], argTuple[1], None] ) 
241      if len(tabSplits) == 3: 
242          assert ann.type == "Coref" 
243          assert tabSplits[2][0] == "[" and tabSplits[2][-1] == "]", (string, tabSplits) 
244          protIds = tabSplits[2][1:-1].split(",") 
245          for protId in protIds: 
246              ann.arguments.append( ["Connected", protId.strip(), None] ) 
247      return ann 
 248   
250      string = string.strip() 
251      id, depType, word1, word2 = string.split() 
252      assert word1[0] == "W" and word2[0] == "W", string 
253      ann = Annotation() 
254      ann.id = id 
255      ann.type = depType 
256      ann.arguments = [("Word", word1), ("Word", word2)] 
257      return ann 
 258   
260       
261      f = codecs.open(filename, "rt", "utf-8") 
262      proteins = [] 
263      words = [] 
264      dependencies = [] 
265      lines = f.readlines() 
266      count = 0 
267      for line in lines: 
268          if line[0] == "T": 
269              proteins.append(readTAnnotation(line)) 
270              count += 1 
271      for line in lines: 
272          if line[0] == "*": 
273              readStarAnnotation(line, proteins) 
274              count += 1 
275      for line in lines: 
276          if line[0] == "W": 
277              words.append(readTAnnotation(line)) 
278              count += 1 
279      for line in lines: 
280          if line[0] == "R":  
281              dependencies.append(readDependencyAnnotation(line)) 
282              count += 1 
283      assert count == len(lines), lines  
284      f.close() 
285       
286      for ann in proteins + words + dependencies: 
287          ann.fileType = "a1" 
288       
289      if len(words) > 0: 
290          wordMap = {} 
291          for word in words: 
292              wordMap[word.id] = word 
293          for dep in dependencies: 
294              for i in range(len(dep.arguments)): 
295                  arg = dep.arguments[i] 
296                  dep.arguments[i] = (arg[0], wordMap[arg[1]]) 
297      return proteins, words, dependencies 
 298   
def loadRelOrA2(filename, proteins, sitesAreArguments=False, readScores=False):
    """Read triggers, events and relations from one annotation file.

    When readScores is true and "<filename>.scores" exists, that file is read
    instead of filename. Lines are dispatched on their first character:
    "T" (trigger), "E" (event), "R" (relation), "M" (modifier) and "*"
    (equivalence). Every line must be one of these; otherwise the final count
    assertion fails.

    proteins are annotations already known (their ids can be referenced by
    events and relations). After parsing, id strings in event triggers and in
    event/relation arguments are replaced by the annotation objects.

    Returns (triggers, events, relations).
    """
    if readScores and os.path.exists(filename + ".scores"):
        f = codecs.open(filename + ".scores", "rt", "utf-8")
    else:
        f = codecs.open(filename, "rt", "utf-8")
    triggers = []
    # id -> annotation for everything a "T" id can refer to (proteins + triggers).
    triggerMap = {}
    for protein in proteins:
        triggerMap[protein.id] = protein
    events = []
    eventMap = {}
    relations = []
    lines = f.readlines()
    f.close()
    # Number of lines consumed by the passes below; checked against len(lines).
    count = 0
    for line in lines:
        if line[0] == "T":
            triggers.append( readTAnnotation(line, readScores=readScores) )
            triggerMap[triggers[-1].id] = triggers[-1]
            count += 1
    for line in lines:
        if line[0] == "E":
            events.append( readEvent(line, sitesAreArguments, readScores=readScores) )
            eventMap[events[-1].id] = events[-1]
            count += 1
    for line in lines:
        if line[0] == "R":
            relations.append(readRAnnotation(line, readScores=readScores))
            count += 1
    for line in lines:
        if line[0] == "M":
            # "Mid<TAB>Type eventId", with a third TAB field of scores when
            # readScores is on.
            if not readScores:
                mId, rest = line.strip().split("\t")
                mScore = None
            else:
                mId, rest, mScore = line.strip().split("\t")
            mType, eventId = rest.split()
            assert mType in ["Speculation", "Negation"]
            if mType == "Speculation":
                eventMap[eventId].speculation = mId
                eventMap[eventId].speculationScores = mScore
            elif mType == "Negation":
                eventMap[eventId].negation = mId
                eventMap[eventId].negationScores = mScore
            count += 1
    for line in lines:
        if line[0] == "*":
            readStarAnnotation(line, proteins + triggers)
            count += 1
    assert count == len(lines), lines

    for ann in triggers + events + relations:
        ann.fileType = "a2"

    for event in events:
        if event.trigger != None:
            # Replace the trigger id by the trigger annotation itself.
            event.trigger = triggerMap[event.trigger]
            # The score strings read for the event are moved onto its trigger
            # object and cleared on the event.
            event.trigger.unmergingScores = event.unmergingScores
            event.trigger.negationScores = event.negationScores
            event.trigger.speculationScores = event.speculationScores
            event.unmergingScores = None
            event.negationScores = None
            event.speculationScores = None

        # Resolve argument targets (index 1) and sites (index 2) from ids to objects.
        for i in range(len(event.arguments)):
            arg = event.arguments[i]
            if arg[1][0] == "T":
                if arg[2] != None:
                    event.arguments[i][1] = triggerMap[arg[1]]
                    event.arguments[i][2] = triggerMap[arg[2]]
                else:
                    event.arguments[i][1] = triggerMap[arg[1]]
            elif arg[1][0] == "E":
                # An argument that is itself an event must not carry a site.
                assert arg[2] == None, (filename, event.id, arg, event.arguments)
                event.arguments[i][1] = eventMap[arg[1]]

    # Relations: only targets whose id starts with "T" are resolved here.
    for relation in relations:
        for i in range(len(relation.arguments)):
            arg = relation.arguments[i]
            if arg[1][0] == "T":
                if arg[2] != None:
                    relation.arguments[i][1] = triggerMap[arg[1]]
                    relation.arguments[i][2] = triggerMap[arg[2]]
                else:
                    relation.arguments[i][1] = triggerMap[arg[1]]

    return triggers, events, relations
 404   
def loadText(filename):
    """Return the complete content of the UTF-8 encoded file filename.

    The file is closed even when reading or decoding fails.
    """
    # codecs.open always opens the underlying file in binary mode, so plain
    # "r" reads exactly what "rt" did on Python 2; unlike "rt" it is also
    # accepted by Python 3, where "rt" would be expanded to the invalid "rtb".
    f = codecs.open(filename, "r", "utf-8")
    try:
        return f.read()
    finally:
        f.close()
 411   
def load(id, dir, loadA2=True, sitesAreArguments=False, a2Tag="a2", readScores=False):
    """Load the annotations of one document from directory dir.

    "<id>.a1" is read with loadA1() when it exists. Unless loadA2 is false,
    "<id>.<a2Tag>" is then read with loadRelOrA2(); when that file is missing,
    "<id>.rel" is tried instead. Missing files simply yield empty lists.

    Returns a 6-tuple
    (proteins, words, dependencies, triggers, events, relations).
    With loadA2=False the last three are empty, while the words and
    dependencies read from the a1 file are still returned.
    """
    id = str(id)
    a1Path = os.path.join(dir, id + ".a1")
    if os.path.exists(a1Path):
        proteins, words, dependencies = loadA1(a1Path)
    else:
        proteins, words, dependencies = [], [], []

    triggers, events, relations = [], [], []
    if loadA2:
        # The tagged result file takes precedence over the ".rel" file.
        a2Path = os.path.join(dir, id + "." + a2Tag)
        relPath = os.path.join(dir, id + ".rel")
        for candidate in (a2Path, relPath):
            if os.path.exists(candidate):
                triggers, events, relations = loadRelOrA2(candidate, proteins, sitesAreArguments, readScores=readScores)
                break
    return proteins, words, dependencies, triggers, events, relations
 434   
def loadSet(path, setName=None, level="a2", sitesAreArguments=False, a2Tag="a2", readScores=False):
    """Load every document of a corpus and return them as a list.

    path may be
      * a ".tar.gz" archive, which is extracted into a temporary directory;
        if that directory contains a sub-directory named like the archive
        (or like the archive name up to its last "_"), documents are read
        from there,
      * a single ".txt" file, which is copied into a temporary directory, or
      * a directory.

    One document is created per "*.txt" file, in sorted id order (the id is
    the file name up to its first "."). level selects how much is read:
    "txt" only the text, "a1" also the a1 annotations, "a2" also the
    a2Tag/".rel" annotations. The content of a "LICENSE" file in the
    directory, when present, is stored on every document, as is setName.

    Any temporary directory created here is removed before returning, also
    when loading fails.
    """
    assert level in ["txt", "a1", "a2"]
    import shutil
    import tempfile
    # Scratch directory owned by this call (None when path is a directory).
    tempDir = None
    try:
        if path.endswith(".tar.gz"):
            import tarfile
            tempDir = tempfile.mkdtemp()
            dir = tempDir
            f = tarfile.open(path, "r")
            try:
                # NOTE(review): extractall() trusts the member paths stored in
                # the archive; only use this on archives from a trusted source.
                f.extractall(dir)
            finally:
                f.close()
            # Archives usually contain one top-level directory named after the package.
            compressedFilePath = os.path.join(dir, os.path.basename(path)[:-len(".tar.gz")])
            if not os.path.exists(compressedFilePath):
                compressedFilePath = compressedFilePath.rsplit("_", 1)[0]
                print >> sys.stderr, "Package name directory does not exist, trying", compressedFilePath
            if os.path.exists(compressedFilePath):
                print >> sys.stderr, "Reading document set from compressed filename directory", compressedFilePath
                dir = compressedFilePath
        elif path.endswith(".txt"):
            tempDir = tempfile.mkdtemp()
            dir = tempDir
            shutil.copy2(path, os.path.join(dir, os.path.basename(path)))
        else:
            dir = path

        license = None
        licensePath = os.path.join(dir, "LICENSE")
        if os.path.exists(licensePath):
            licenseFile = open(licensePath, "rt")
            try:
                license = licenseFile.read()
            finally:
                licenseFile.close()

        ids = set()
        for filename in os.listdir(dir):
            if filename.endswith(".txt"):
                ids.add(filename.split(".")[0])

        documents = []
        for id in sorted(ids):
            doc = Document()
            doc.id = id
            if not level == "txt":
                try:
                    doc.proteins, doc.words, doc.dependencies, doc.triggers, doc.events, doc.relations = load(str(id), dir, level=="a2", sitesAreArguments, a2Tag=a2Tag, readScores=readScores)
                except:
                    # Report which document failed, then let the error propagate.
                    print >> sys.stderr, "Exception reading document", id, "from", dir
                    raise
            doc.text = loadText( os.path.join(dir, str(id) + ".txt") )
            doc.dataSet = setName
            doc.license = license
            documents.append(doc)
        return documents
    finally:
        # Remove the whole scratch directory, not just the sub-directory the
        # documents were read from, and never anything this call did not create.
        if tempDir is not None:
            shutil.rmtree(tempDir)
 489   
def writeSet(documents, output, resultFileTag="a2", debug=False, task=2, validate=True, writeScores=False):
    """Write all documents to a directory or a ".tar.gz" package.

    Trailing "/" characters are dropped from output. When output ends with
    ".tar.gz" the files are first written to "<output>-temp", packaged with
    package() and the temporary directory is removed; otherwise output itself
    is the target directory. An already existing target directory is deleted
    first.

    For each document the annotations are written with write() and the text
    to "<id>.txt". With validate set, Validate.allValidate() is run on every
    document before it is written. The collected counts are printed at the end.
    """
    from collections import defaultdict
    import shutil

    counts = defaultdict(int)
    output = output.rstrip("/")
    asArchive = output.endswith(".tar.gz")
    outdir = output + "-temp" if asArchive else output
    # Start from a clean target directory.
    if os.path.exists(outdir):
        shutil.rmtree(outdir)

    if not validate:
        sys.stdout.write("Warning! No validation." + "\n")
    for doc in documents:
        if validate:
            if debug:
                print >> sys.stderr, "Validating", doc.id
            Validate.allValidate(doc, counts, task, verbose=debug)
        if debug:
            print >> sys.stderr, "Writing", doc.id
        write(doc.id, outdir, doc.proteins, doc.triggers, doc.events, doc.relations, resultFileTag, counts, task=task, writeScores=writeScores)
        # The document text goes next to the annotation files.
        textPath = os.path.join(outdir, str(doc.id) + ".txt")
        out = codecs.open(textPath, "wt", "utf-8")
        out.write(doc.text)
        out.close()
    if asArchive:
        package(outdir, output, ["a1", "txt", resultFileTag, resultFileTag+".scores"])
        shutil.rmtree(outdir)
    sys.stdout.write(str(counts) + "\n")
 523           
524   
526      nums = [0] 
527      for annotation in annotations: 
528          if annotation.id != None: 
529              assert annotation.id[1:].isdigit(), annotation.id 
530              nums.append(int(annotation.id[1:])) 
531      return max(nums) 
 532   
534      newIds = False 
535      for ann in annotations: 
536          if ann.id == None: 
537              newIds = True 
538              break 
539      if newIds: 
540          idCount = max(getMaxId(annotations) + 1, minId) 
541          for ann in annotations: 
542              if len(ann.arguments) == 0 and ann.trigger == None: 
543                  ann.id = "T" + str(idCount) 
544              elif ann.type in ["Subunit-Complex", "Protein-Component", "Coref", "Renaming", "SR-subunitof", "SR-equivto", "SR-partof", "SR-memberof"]: 
545                  ann.id = "R" + str(idCount) 
546               
547              else: 
548                  ann.id = "E" + str(idCount) 
549              idCount += 1 
 550   
552      updateIds(proteins, idStart) 
553      for protein in proteins: 
554          assert protein.id[0] == "T", (protein.id, protein.text) 
555          out.write(protein.id + "\t") 
556          out.write(protein.type + " " + str(protein.charBegin) + " " + str(protein.charEnd) + "\t") 
557          if protein.text == None: 
558              out.write(str(protein.text)) 
559          else: 
560              out.write(protein.text.replace("\n", "
").replace("\r", "
")) 
561          if writeScores and protein.triggerScores != None: 
562              out.write("\t" + protein.triggerScores.replace(":", "=")) 
563          out.write("\n") 
 564   
566       
567       
568       
569       
570       
571      duplicateMap = {} 
572      seenLineMap = {} 
573      for eventLineTuple in eventLines: 
574          if eventLineTuple[1] not in seenLineMap: 
575              seenLineMap[eventLineTuple[1]] = eventLineTuple[0] 
576          else: 
577              duplicateMap[eventLineTuple[0]] = seenLineMap[eventLineTuple[1]] 
578      return duplicateMap 
 579   
580   
581   
582   
583   
584   
585   
586   
587   
def writeEvents(events, out, counts, task, writeScores=False):
    """Write event (or relation) annotations to the open stream out.

    Each annotation becomes one line "<id><TAB><type>[:<triggerId>] <args>",
    optionally followed by modifier lines ("M<n><TAB>Speculation <id>" /
    "M<n><TAB>Negation <id>"). Arguments are written as "name:targetId";
    when an argument name occurs more than once in the same annotation the
    occurrences are numbered ("Theme1", "Theme2"). Sites (index 2 of an
    argument) are written as "Site"/"CSite" fields unless task is 1.
    For "Coref" annotations, "Target" arguments are not written as fields but
    as a sorted, bracketed id list after an extra TAB.

    With writeScores set, score strings are appended after an extra ":"
    (or TAB for modifiers), with ":" inside the scores replaced by "=".

    counts is accepted but not used in this function.
    """
    # Presumably fills in ids that are still missing -- updateIds is defined
    # elsewhere in this file.
    updateIds(events)
    # Running number for the "M" modifier lines.
    mCounter = 1
    # [event id, text written after the id] for every event.
    eventLines = []
    # Ids of events used as arguments of other events (collected only).
    nestedEvents = set()
    for event in events:
        eventLine = ""

        trigger = event.trigger
        if trigger == None:
            eventLine += event.type
        else:
            eventLine += trigger.type + ":" + trigger.id
            if writeScores and event.trigger.unmergingScores != None:
                eventLine += ":" + event.trigger.unmergingScores.replace(":", "=")

        # How often each argument name occurs; decides whether names get numbered.
        typeCounts = {}

        targetProteins = set()
        for arg in event.arguments:
            argType = arg[0]
            if argType == "Target" and event.type == "Coref":
                targetProteins.add(arg[1].id)
            else:
                if not typeCounts.has_key(argType):
                    typeCounts[argType] = 0
                typeCounts[argType] += 1

        # Pass 1: the arguments themselves.
        currTypeCounts = {}
        for key in typeCounts.keys():
            currTypeCounts[key] = 0
        for arg in event.arguments:
            argType = arg[0]
            if argType == "Target" and event.type == "Coref":
                continue
            assert arg[1].id != None, (event.id, event.arguments, arg)
            currTypeCounts[argType] += 1
            if typeCounts[argType] > 1:
                eventLine += " " + argType + str(currTypeCounts[argType]) + ":" + arg[1].id
            else:
                eventLine += " " + argType + ":" + arg[1].id
            if writeScores and len(arg) > 3 and arg[3] != None:
                eventLine += ":" + arg[3].replace(":", "=")

            if arg[1].id[0] == "E":
                nestedEvents.add(arg[1].id)

        # Pass 2: the sites attached to arguments. The counters are reset and
        # only advanced for arguments that have a site.
        currTypeCounts = {}
        for key in typeCounts.keys():
            currTypeCounts[key] = 0

        for arg in event.arguments:
            if task == 1:
                continue

            if arg[2] == None:
                continue

            argType = arg[0]
            if argType == "Target" and event.type == "Coref":
                continue
            currTypeCounts[argType] += 1

            # Sites of Cause arguments are written as "CSite".
            sitePrefix = ""
            if argType.find("Cause") != -1:
                sitePrefix = "C"
            if typeCounts[argType] > 1:
                eventLine += " " + sitePrefix + "Site" + str(currTypeCounts[argType]) + ":" + arg[2].id
            else:
                eventLine += " " + sitePrefix + "Site" + ":" + arg[2].id
            if writeScores and len(arg) > 4 and arg[4] != None:
                eventLine += ":" + arg[4].replace(":", "=")

        if len(targetProteins) > 0:
            eventLine += "\t[" + ", ".join(sorted(list(targetProteins))) + "]"

        eventLine += "\n"

        # Modifier lines are appended to the same entry so they directly
        # follow their event in the output.
        if event.speculation != None:
            eventLine += "M" + str(mCounter) + "\t" + "Speculation " + str(event.id)
            if writeScores and event.trigger != None and event.trigger.speculationScores != None:
                eventLine += "\t" + event.trigger.speculationScores.replace(":", "=")
            eventLine += "\n"
            mCounter += 1
        if event.negation != None:
            eventLine += "M" + str(mCounter) + "\t" + "Negation " + str(event.id)
            if writeScores and event.trigger != None and event.trigger.negationScores != None:
                eventLine += "\t" + event.trigger.negationScores.replace(":", "=")
            eventLine += "\n"
            mCounter += 1

        eventLines.append( [event.id, eventLine] )

    for eventLineTuple in eventLines:
        out.write(eventLineTuple[0] + "\t" + eventLineTuple[1])
 706           
707   
708   
709   
710   
711   
712   
713       
714       
715       
716   
def write(id, dir, proteins, triggers, events, relations, resultFileTag="a2", counts=None, debug=False, task=2, writeScores=False):
    """Write the annotation files of one document into directory dir.

    dir is created when it does not exist. Files written:
      * "<id>.a1" with the proteins, unless proteins is None,
      * "<id>.<resultFileTag>" with triggers, events and relations,
      * "<id>.<resultFileTag>.scores" with the same content plus scores,
        only when writeScores is true.

    Trigger ids start after the highest protein id; when proteins is None
    they start at 1. All files are closed even if writing fails.
    """
    id = str(id)
    if debug:
        sys.stdout.write(id + "\n")
    if not os.path.exists(dir):
        os.makedirs(dir)

    if proteins is not None:
        out = codecs.open(os.path.join(dir, id + ".a1"), "wt", "utf-8")
        try:
            writeTAnnotation(proteins, out, False)
        finally:
            out.close()
        # Computed after writing, as writing may assign missing protein ids.
        triggerIdStart = getMaxId(proteins) + 1
    else:
        # No proteins at all: same start value as for an empty protein list.
        triggerIdStart = 1

    resultFile = codecs.open(os.path.join(dir, id + "." + resultFileTag), "wt", "utf-8")
    resultScoresFile = None
    try:
        writeTAnnotation(triggers, resultFile, False, triggerIdStart)
        if writeScores:
            resultScoresFile = codecs.open(os.path.join(dir, id + "." + resultFileTag + ".scores"), "wt", "utf-8")
            writeTAnnotation(triggers, resultScoresFile, True, triggerIdStart)
        if len(events) > 0:
            if debug: print >> sys.stderr, "Writing events"
            writeEvents(events, resultFile, counts, task, writeScores=False)
            if writeScores:
                writeEvents(events, resultScoresFile, counts, task, writeScores=True)
        if len(relations) > 0:
            if debug: print >> sys.stderr, "Writing relations"
            writeEvents(relations, resultFile, counts, task)
            if writeScores:
                writeEvents(relations, resultScoresFile, counts, task, writeScores=True)
    finally:
        resultFile.close()
        if resultScoresFile is not None:
            resultScoresFile.close()
 751   
def package(sourceDir, outputFile, includeTags=["a2", "a2.scores"]):
    """Pack selected files of sourceDir into the gzipped tar file outputFile.

    A directory entry is included when its name ends with one of includeTags.
    Entries are stored under their bare names (no directory prefix). The
    current working directory of the process is left untouched, and the
    archive is closed even when adding a file fails.

    includeTags is only read, never modified.
    """
    import tarfile
    tags = tuple(includeTags)
    tarFiles = [name for name in os.listdir(sourceDir) if name.endswith(tags)]
    packageFile = tarfile.open(outputFile, "w:gz")
    try:
        for name in tarFiles:
            # arcname keeps the member names relative without needing chdir().
            packageFile.add(os.path.join(sourceDir, name), arcname=name)
    finally:
        packageFile.close()
 770   
if __name__=="__main__":
    # Command line entry point: read a document set and write it back out.
    import sys
    from optparse import OptionParser

    # Psyco is optional; whether it was found is only reported on stderr.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    # (flags, settings) of every option, in the order shown by --help.
    optionTable = [
        (("-i", "--input"), dict(default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")),
        (("-o", "--output"), dict(default=None, dest="output", help="Output file in interaction xml format.")),
        (("-t", "--outputTag"), dict(default="a2", dest="outputTag", help="a2 file extension.")),
        (("-s", "--sentences"), dict(default=False, action="store_true", dest="sentences", help="Write each sentence to its own document")),
        (("-r", "--origIds"), dict(default=False, action="store_true", dest="origIds", help="Use stored original ids (can cause problems with duplicates).")),
        (("-a", "--task"), dict(default=2, type="int", dest="task", help="1 or 2")),
        (("-d", "--debug"), dict(default=False, action="store_true", dest="debug", help="Verbose output.")),
    ]
    optparser = OptionParser(usage="%prog [options]\nST format input and output.")
    for flags, settings in optionTable:
        optparser.add_option(*flags, **settings)
    (options, args) = optparser.parse_args()

    # Reading and writing the same location would destroy the input.
    assert options.input != options.output
    documents = loadSet(options.input, "GE", level="a2", sitesAreArguments=False, a2Tag="a2", readScores=False)
    writeSet(documents, options.output, resultFileTag=options.outputTag, debug=options.debug, task=options.task, validate=True, writeScores=False)
795           
796   
797   
798   
799   
800   
801   
802   
803   
804   
805   
806   
807   
808   
809   
810   
811