1 import sys, os
2 thisPath = os.path.dirname(os.path.abspath(__file__))
3 sys.path.append(os.path.abspath(os.path.join(thisPath,"../..")))
4 try:
5 import xml.etree.cElementTree as ET
6 except ImportError:
7 import cElementTree as ET
8 import Utils.ElementTreeUtils as ETUtils
9 import RecalculateIds
10
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    """
    Move selected documents of a corpus from one set to another.

    Each "document" element is identified by its "pmid" attribute or, when
    that attribute is missing, by its "origId" attribute. Every document
    whose identifier is listed in docOrigIds gets its "set" attribute changed
    to targetSet. Afterwards RecalculateIds.recalculateIds is called on the
    tree (with onlyWithinSentence=False) and the corpus is optionally written
    out.

    Args:
        input: the corpus, in whatever form ETUtils.ETFromObj accepts.
        output: destination passed to ETUtils.write, or None to skip writing.
        docOrigIds: collection of document identifiers to move, or None to
            move nothing. The collection is not modified.
        sourceSet: value the "set" attribute of every moved document must
            currently have.
        targetSet: value the "set" attribute of every moved document is
            changed to.

    Returns:
        The corpus tree returned by ETUtils.ETFromObj, modified in place.

    Raises:
        AssertionError: if a selected document is not in sourceSet, or if
            some identifier in docOrigIds matched no document.
    """
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds is not None:
        # Work on a private copy so the caller's collection is left intact.
        # A list (not a set) keeps the original one-removal-per-match
        # behaviour when an identifier is listed more than once.
        pendingIds = list(docOrigIds)
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId is None:
                docId = document.get("origId")
            if docId in pendingIds:
                assert document.get("set") == sourceSet, \
                    (docId, document.get("set"), sourceSet)
                document.set("set", targetSet)
                pendingIds.remove(docId)
        # Anything left over was requested but not found in the corpus.
        assert len(pendingIds) == 0, pendingIds

    # Sentence-level mixing is currently disabled: sentenceIds is hard-coded
    # to None, so the branch below never runs. It would split the selected
    # sentences of a document off into a new document placed in targetSet.
    sentenceIds = None
    if sentenceIds is not None:
        # Iterate over a snapshot, because new documents are appended to
        # corpusRoot inside the loop.
        for document in list(corpusRoot.getiterator("document")):
            removed = []
            for sentence in document.findall("sentence"):
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    # Only documents that actually lose a sentence need to be
                    # in sourceSet (mirrors the document-level check above).
                    assert document.get("set") == sourceSet, \
                        (sentenceId, document.get("set"), sourceSet)
                    # Element.remove() returns None, so keep the sentence
                    # element itself rather than the call's result.
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if removed:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                # NOTE(review): None is a placeholder id; presumably the
                # recalculateIds call below assigns a real one - confirm,
                # since ElementTree cannot serialise a None attribute value.
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds

    RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
54