Package TEES :: Package Utils :: Package InteractionXML :: Module MixSets
[hide private]

Source Code for Module TEES.Utils.InteractionXML.MixSets

 1  import sys, os 
 2  thisPath = os.path.dirname(os.path.abspath(__file__)) 
 3  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
 4  try: 
 5      import xml.etree.cElementTree as ET 
 6  except ImportError: 
 7      import cElementTree as ET 
 8  import Utils.ElementTreeUtils as ETUtils 
 9  import RecalculateIds 
10   
def _moveDocuments(corpusRoot, docOrigIds, sourceSet, targetSet):
    """
    Re-label every document whose identifier is listed in docOrigIds.

    A document is identified by its "pmid" attribute, or by "origId" when
    it has no "pmid". Each listed identifier is consumed by the first
    document that carries it.

    @raise AssertionError: if a matched document is not in sourceSet, or
        if some identifier in docOrigIds matched no document.
    """
    # Consume a private copy so the caller's collection is left untouched.
    remaining = list(docOrigIds)
    for document in corpusRoot.getiterator("document"):
        docId = document.get("pmid")
        if docId is None:
            docId = document.get("origId")
        if docId in remaining:
            assert document.get("set") == sourceSet, (docId, document.get("set"), sourceSet)
            document.set("set", targetSet)
            remaining.remove(docId)
    # Anything left over was requested but never found in the corpus.
    assert len(remaining) == 0, remaining


def _moveSentences(corpusRoot, sentenceIds, sourceSet, targetSet):
    """
    Split the listed sentences out of their documents into new documents
    that belong to targetSet.

    For every document that loses at least one sentence, a new "document"
    element is appended to corpusRoot. It copies the attributes of the
    original document, has "set" replaced by targetSet, and receives the
    removed sentences in their original order.

    @raise AssertionError: if a document containing sentences is not in
        sourceSet, or if some identifier in sentenceIds matched no sentence.
    """
    remaining = list(sentenceIds)
    # Iterate over a snapshot: new documents are appended to corpusRoot
    # inside the loop and must not be visited themselves.
    for document in list(corpusRoot.getiterator("document")):
        removed = []
        for sentence in document.findall("sentence"):
            # NOTE(review): this check runs for every sentence, not only the
            # ones being moved, so all sentence-bearing documents must be in
            # sourceSet -- confirm that this is intended.
            assert document.get("set") == sourceSet
            sentenceId = sentence.get("id")
            if sentenceId in remaining:
                # Element.remove() returns None, so keep the element itself.
                document.remove(sentence)
                removed.append(sentence)
                remaining.remove(sentenceId)
        if len(removed) > 0:
            newDoc = ET.Element("document")
            for attr in document.attrib:
                newDoc.set(attr, document.get(attr))
            # Placeholder id; presumably overwritten by the id recalculation
            # the caller runs afterwards -- TODO confirm.
            newDoc.set("id", None)
            newDoc.set("set", targetSet)
            for sentence in removed:
                newDoc.append(sentence)
            corpusRoot.append(newDoc)
    assert len(remaining) == 0, remaining


def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    """
    Move selected documents of an Interaction XML corpus from one set to
    another and recalculate the corpus ids.

    @param input: the corpus, in any form accepted by ETUtils.ETFromObj.
    @param output: destination passed to ETUtils.write, or None to skip
        writing.
    @param docOrigIds: identifiers ("pmid", or "origId" for documents
        without a "pmid") of the documents to move, or None to move
        nothing. The collection is not modified.
    @param sourceSet: value the "set" attribute of every moved document
        must currently have.
    @param targetSet: value written to the "set" attribute of every moved
        document.
    @return: the modified corpus tree.
    @raise AssertionError: if a selected document is not in sourceSet, or
        if an identifier in docOrigIds is not found in the corpus.
    """
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds is not None:
        _moveDocuments(corpusRoot, docOrigIds, sourceSet, targetSet)

    # Sentence-level mixing is not exposed through this function's
    # parameters, so it stays switched off here exactly as before.
    sentenceIds = None
    if sentenceIds is not None:
        _moveSentences(corpusRoot, sentenceIds, sourceSet, targetSet)

    RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
54