# Module header: standard-library imports, import-path setup and project
# imports.
import sys
import os
import copy

# Append the directory two levels above this file to the import path before
# the project imports below run (presumably so that the absolute "Utils.*"
# imports resolve when this file is executed directly -- confirm).
extraPath = os.path.dirname(os.path.abspath(__file__)) + "/../../"
sys.path.append(extraPath)
from Utils.ProgressCounter import ProgressCounter

# Use the ElementTree implementation under xml.etree when it can be imported;
# otherwise fall back to a top-level cElementTree module.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
import IDUtils


def getElementTypes(element, separator="---"):
    """Return the type names stored in an element's "type" attribute.

    A merged type is a single attribute value that holds several type names
    joined by ``separator``; it is split into its parts. A value without the
    separator yields a one-item list.

    NOTE(review): the ``def`` line of this function was missing from the
    stored file. The function name and the one-argument call form are taken
    from the caller in this module; the "---" default for ``separator`` is an
    assumption and must be confirmed against the corpus convention.

    Args:
        element: object exposing ``get(name)`` (e.g. an ElementTree element).
        separator: string that delimits the individual type names.

    Returns:
        A list of type-name strings, or ``[None]`` when the element has no
        "type" attribute.

    Raises:
        ValueError: if ``separator`` is the empty string (from str.split).
    """
    typeName = element.get("type")
    if typeName is None:
        # No type attribute: report a single (missing) type so that callers
        # treat the element as unmerged instead of failing on None.find().
        return [typeName]
    # str.split() already returns [typeName] when the separator is absent,
    # so no separate find() check is needed.
    return typeName.split(separator)

def splitMerged(sentence, elementName, countsByType=None):
    """Replace merged-type children of ``sentence`` with one child per type.

    For every direct child of ``sentence`` with tag ``elementName`` whose
    type list (as returned by ``getElementTypes``) has more than one entry,
    one new element is created per type and the original child is removed.
    Each new element receives a copy of the original's attributes with
    "type" set to the single type and "id" set to a fresh identifier. Only
    attributes are copied; child nodes and text of the original are not.

    NOTE(review): the ``def`` line of this routine was missing from the
    stored file; the function name and signature are reconstructed from the
    names its body uses and should be confirmed.

    Args:
        sentence: parent element whose children are processed in place.
        elementName: tag of the children to examine.
        countsByType: optional dict mapping a tag to a two-item list; item 0
            is increased by the number of removed elements and item 1 by the
            number of created elements.

    Raises:
        IndexError: if a merged element's "id" contains no "." or ends in one.
        KeyError: if ``countsByType`` is given but lacks ``elementName``.
    """
    elements = sentence.findall(elementName)
    newElements = []
    removeCount = 0
    # First running number for new ids, obtained once for the whole sentence.
    newIdCount = IDUtils.getNextFreeId(elements)

    for element in elements:
        types = getElementTypes(element)
        if len(types) <= 1:
            continue
        # Keep everything up to the last "." of the old id plus the first
        # character after it; the running number is appended to that stem.
        idSplits = element.get("id").rsplit(".", 1)
        idStem = idSplits[0] + "." + idSplits[1][0]
        for typeName in types:
            newElement = ET.Element(elementName)
            for k, v in element.attrib.items():
                newElement.set(k, v)
            newElement.set("type", typeName)
            newElement.set("id", idStem + str(newIdCount))
            newIdCount += 1
            newElements.append(newElement)
        sentence.remove(element)
        removeCount += 1

    if newElements:
        # Insert at the index of the last remaining child with this tag, or
        # at index 0 when none remain. Every insert uses the same index, so
        # the new elements end up in reverse creation order directly before
        # that child (ordering kept as in the original implementation).
        remaining = sentence.findall(elementName)
        insertPos = 0
        if remaining:
            insertPos = list(sentence).index(remaining[-1])
        for newElement in newElements:
            sentence.insert(insertPos, newElement)

    if countsByType is not None:
        countsByType[elementName][0] += removeCount
        countsByType[elementName][1] += len(newElements)


def processSentence(sentence, countsByType=None):
    """Split merged-type "entity", "interaction" and "pair" children.

    NOTE(review): this function was missing from the stored file. It is
    reconstructed from its call site (``processSentence(sentence,
    countsByType)``) and from the three tags the caller tracks in
    ``countsByType``; confirm the tag list.

    Args:
        sentence: sentence element, modified in place.
        countsByType: optional per-tag [removed, created] counters, passed
            through to ``splitMerged``.
    """
    for elementName in ("entity", "interaction", "pair"):
        splitMerged(sentence, elementName, countsByType)



def run(inputFilename, outputFilename=None):
    """Split merged-type elements in every sentence of a corpus.

    Loads the corpus with ``ETUtils.ETFromObj``, calls ``processSentence`` on
    each "sentence" child of each "document" child of the root, prints the
    per-tag removed/created totals to stderr and, when an output name is
    given, writes the root with ``ETUtils.write``.

    NOTE(review): the ``def`` line was missing from the stored file. The
    name comes from the call in the script section and the parameter names
    from the body; the ``None`` default for ``outputFilename`` is an
    assumption that stays compatible with the two-argument call.

    Args:
        inputFilename: corpus source handed to ``ETUtils.ETFromObj``.
        outputFilename: destination handed to ``ETUtils.write``; nothing is
            written when it is None.

    Returns:
        The tree object returned by ``ETUtils.ETFromObj``, after in-place
        modification.
    """
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    # Per tag: [number of removed elements, number of created elements].
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)

    print >> sys.stderr, "Results"
    for k in sorted(countsByType):
        print >> sys.stderr, " " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]

    if outputFilename is not None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree

# Command-line entry point: parse -i/-o and run the splitter on the corpus.
if __name__ == "__main__":
    import sys

    from optparse import OptionParser

    # Psyco is optional: enable it when it can be imported, otherwise only
    # report its absence and continue.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    # The usage text previously read "Path generator.", which does not
    # describe this tool; it now matches the banner printed by run().
    optparser = OptionParser(usage="%prog [options]\nSplit elements with merged types.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    (options, args) = optparser.parse_args()

    # Both file options are mandatory; print the help and exit with status 1
    # when either is missing.
    if options.input is None:
        print >> sys.stderr, "Error, input file not defined."
        optparser.print_help()
        sys.exit(1)
    if options.output is None:
        print >> sys.stderr, "Error, output file not defined."
        optparser.print_help()
        sys.exit(1)

    run(options.input, options.output)