Package TEES :: Package Utils :: Package Convert :: Module DDITools
[hide private]

Source Code for Module TEES.Utils.Convert.DDITools

  1  import sys, os 
  2  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  3  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
  4  import Utils.ElementTreeUtils as ETUtils 
  5  from collections import defaultdict 
  6   
def makeDDISubmissionFile(input, output):
    """
    Write a DDI'11 submission file covering every entity pair of every sentence.

    For each "sentence" element of the interaction XML, every pair of its
    "entity" elements (taken in document order, first index < second index)
    is written as one tab-separated line: the id of the first entity, the id
    of the second entity and a label. The label is "1" when the sentence
    contains an "interaction" element whose type is not "neg" linking the two
    ids (in either direction), and "0" otherwise.

    @param input: interaction XML, in a form accepted by ETUtils.ETFromObj
    @param output: path of the text file to write
    """
    xml = ETUtils.ETFromObj(input)
    # The context manager guarantees the file is flushed and closed, also
    # when processing fails part-way through.
    with open(output, "wt") as outFile:
        for sentence in xml.getiterator("sentence"):
            # First determine which pairs interact. Both orderings are stored
            # so that the lookup below does not depend on edge direction.
            interacting = set()
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") != "neg":
                    e1 = interaction.get("e1")
                    e2 = interaction.get("e2")
                    interacting.add((e1, e2))
                    interacting.add((e2, e1))
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i, entityI in enumerate(entities):
                eIId = entityI.get("id")
                for entityJ in entities[i + 1:]:
                    eJId = entityJ.get("id")
                    if (eIId, eJId) in interacting:
                        label = "1"
                    else:
                        label = "0"
                    outFile.write(eIId + "\t" + eJId + "\t" + label + "\n")
31
def transferClassifications(input, rls, output):
    """
    Replace the last tab-separated column of each input line with an RLS line.

    The two files are read line by line in parallel. For every pair of
    lines, everything after the last tab of the input line is dropped and
    the corresponding line of the rls file (including its line ending) is
    appended in its place.

    @param input: path of the tab-separated file whose last column is replaced
    @param rls: path of the file holding one replacement value per line
    @param output: path of the text file to write
    @raise AssertionError: if either source file does not exist or the two
        files have a different number of lines
    """
    assert os.path.exists(input), input
    with open(input, "rt") as f:
        inputLines = f.readlines()

    assert os.path.exists(rls), rls
    with open(rls, "rt") as f:
        rlsLines = f.readlines()

    # Validate before opening the output, so that a mismatch does not leave
    # an empty, truncated output file behind.
    assert len(inputLines) == len(rlsLines), (len(inputLines), len(rlsLines))
    with open(output, "wt") as outFile:
        for inputLine, rlsLine in zip(inputLines, rlsLines):
            outFile.write(inputLine.rsplit("\t", 1)[0] + "\t" + rlsLine)
48
def addMTMX(input, mtmxDir, output=None):
    """
    Copy matching MTMX mappings onto the entities of an interaction XML.

    Documents of the interaction XML are indexed by their "origId". Every
    ".xml" file in mtmxDir whose name prefix (the part before the first "_")
    equals a document origId is read, and each of its PHRASE elements whose
    "ID" equals the "origId" of an entity of that document is examined. A MAP
    element of the phrase matches when its NAME or NAME_SHORT equals the
    entity text, ignoring case. A matching MAP sets the entity attributes
    mtmxProb, mtmxCui, mtmxName, mtmxNameShort and mtmxSemTypes. If the
    entity already has an mtmxProb greater than the PROB of the matching MAP,
    the remaining MAP elements of that phrase are skipped.

    Progress is written to stdout and stderr, and the collected counts are
    written to stderr at the end.

    @param input: interaction XML, in a form accepted by ETUtils.ETFromObj
    @param mtmxDir: directory containing the MTMX XML files
    @param output: if not None, the modified XML is written here with
        ETUtils.write
    @raise AssertionError: if two documents share an origId, or two entities
        of one document share an origId
    """
    from collections import defaultdict

    # read interaction XML
    print("Reading interaction XML")
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for _ in xml.getiterator("entity"):
        counts["entity"] += 1

    # read MTMX files
    print("Processing MTMX")
    for filename in sorted(os.listdir(mtmxDir)):
        if not filename.endswith(".xml"):
            continue
        fileId = filename.split("_")[0]
        if fileId not in docById:
            sys.stderr.write(filename + " skipped\n")
            continue
        sys.stderr.write(filename + " processing\n")

        entityByOrigId = {}
        for entity in docById[fileId].getiterator("entity"):
            origId = entity.get("origId")
            assert origId not in entityByOrigId, origId
            entityByOrigId[origId] = entity

        mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
        for phrase in mtmx.getiterator("PHRASE"):
            if phrase.get("ID") not in entityByOrigId:
                continue
            entity = entityByOrigId[phrase.get("ID")]
            entityText = entity.get("text")
            if entityText is None:
                # Nothing to compare the MAP names against.
                continue
            entityText = entityText.lower()
            # NOTE(review): the listing this function was restored from had
            # lost its indentation; mapCount is assumed to advance once per
            # applied mapping (next to counts["mappings"]) -- confirm.
            mapCount = 0
            for mapping in phrase.getiterator("MAP"):
                # A missing NAME / NAME_SHORT attribute simply cannot match.
                names = [mapping.get("NAME"), mapping.get("NAME_SHORT")]
                names = [name.lower() for name in names if name is not None]
                if entityText not in names:
                    continue
                if entity.get("mtmxProb") is not None:
                    if int(entity.get("mtmxProb")) > int(mapping.get("PROB")):
                        # The existing mapping has the higher PROB: keep it
                        # and stop looking at this phrase.
                        break
                    counts["mapped-multi"] += 1
                    counts["mapped-multi-" + str(mapCount)] += 1
                else:
                    counts["mapped-at-least-once"] += 1
                entity.set("mtmxProb", str(mapping.get("PROB")))
                entity.set("mtmxCui", str(mapping.get("CUI")))
                entity.set("mtmxName", str(mapping.get("NAME")))
                entity.set("mtmxNameShort", str(mapping.get("NAME_SHORT")))
                entity.set("mtmxSemTypes", str(mapping.get("SEMTYPES")))
                counts["mappings"] += 1
                mapCount += 1

    sys.stderr.write(str(counts) + "\n")
    if output is not None:
        ETUtils.write(xml, output)
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the selected tool.

    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    from optparse import OptionParser
    optparser = OptionParser(description="Tools for the DDI'11 Shared Task")
    optparser.add_option("-i", "--input", default=None, dest="input", help="input file (interaction XML)")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output file (txt file)")
    optparser.add_option("-d", "--add", default=None, dest="add", help="data to be added, e.g. rls classifications")
    optparser.add_option("-a", "--action", default=None, dest="action", help="")
    (options, args) = optparser.parse_args()

    # Reject unknown actions explicitly: an assert would be stripped under
    # "python -O" and the script would then silently do nothing.
    validActions = ["SUBMISSION", "TRANSFER_RLS", "ADD_MTMX"]
    if options.action not in validActions:
        optparser.error("action must be one of " + ", ".join(validActions) + " (got " + repr(options.action) + ")")

    if options.action == "SUBMISSION":
        makeDDISubmissionFile(options.input, options.output)
    elif options.action == "TRANSFER_RLS":
        transferClassifications(options.input, options.add, options.output)
    elif options.action == "ADD_MTMX":
        addMTMX(options.input, options.add, options.output)