Package TEES :: Package Utils :: Package Convert :: Module SubtiWiki
[hide private]

Source Code for Module TEES.Utils.Convert.SubtiWiki

 1  import sys, os 
 2  import codecs, time 
 3  from wikitools import wiki 
 4  from wikitools import api 
 5   
def loadPages(site, pages, outDir, wait=3):
    """
    Download the XML export of each wiki page and save it into outDir.

    For every title in pages, an API query with the 'export' parameter is
    sent to site and the exported XML is written, UTF-8 encoded, to
    "<outDir>/<title>.xml". Progress messages go to stderr, and the
    function sleeps after every page.

    @param site: the wiki to query (the script entry point passes a
        wikitools wiki.Wiki object); handed unchanged to api.APIRequest
    @param pages: sequence of page titles; must support len()
    @param outDir: directory the .xml files are written into; it is not
        created here and must already exist
    @param wait: number of seconds to sleep after each page
    """
    total = len(pages)
    for index, page in enumerate(pages):
        sys.stderr.write("Loading page %s (%d/%d)\n" % (page, index + 1, total))
        # define the params for the query
        params = {'action': 'query', 'titles': page, 'export': None}
        # create the request object and query the API
        request = api.APIRequest(site, params)
        result = request.query()
        # Pull the payload out before opening the output file, so that a
        # response without an export does not leave an empty file behind.
        exportedXML = result["query"]["export"]["*"]

        sys.stderr.write("Writing result\n")
        # codecs.open always works in binary mode, so plain "w" is used;
        # the handle is closed even if the write fails.
        with codecs.open(os.path.join(outDir, page + ".xml"), "w", "utf-8") as outFile:
            outFile.write(exportedXML)

        sys.stderr.write("Sleeping\n")
        time.sleep(wait)
25
def readProteinNames(file):
    """
    Read a list of names from a UTF-8 text file, one name per line.

    Leading and trailing whitespace is stripped from every line. Lines
    that are empty after stripping are skipped, so a trailing blank line
    does not produce an empty name.

    @param file: path of the text file to read
    @return: list of names in file order
    """
    names = []
    # codecs.open always works in binary mode, so plain "r" is used; the
    # with-statement makes sure the file is closed again.
    with codecs.open(file, "r", "utf-8") as f:
        for line in f:
            name = line.strip()
            if name:
                names.append(name)
    return names
32
def _splitSynonyms(line):
    """
    Return the non-empty synonyms from the second "||"-separated cell of
    a wiki table line, with the wiki italics markup ('') removed.
    """
    cell = line.split("||")[1].replace("''", "")
    stripped = [part.strip() for part in cell.split(",")]
    return [synonym for synonym in stripped if synonym != ""]


def getSynonyms(inDir, outFilename):
    """
    Collect the synonyms from the downloaded page exports into one file.

    Every "<name>.xml" file in inDir is processed in sorted order and
    yields one output line: the name (file name without ".xml") followed
    by the comma-separated synonyms found on table lines containing
    "Synonyms". If the line following a "Gene Name" table line does not
    contain the name, a warning is printed to stdout.

    @param inDir: directory containing the .xml files to scan
    @param outFilename: path of the output file, written as UTF-8
    """
    suffix = ".xml"
    # The output is written as UTF-8 because the synonyms come from
    # decoded (unicode) page text; both files are closed on any exit path.
    with codecs.open(outFilename, "w", "utf-8") as outFile:
        for fileName in sorted(os.listdir(inDir)):
            # Only "<page>.xml" files count; stripping just the suffix
            # keeps names that themselves contain a dot intact.
            if not fileName.endswith(suffix):
                continue
            protName = fileName[:-len(suffix)]
            outFile.write(protName)
            with codecs.open(os.path.join(inDir, fileName), "r", "utf-8") as f:
                geneNameLine = False  # True when the previous line was the "Gene Name" table line
                for line in f:
                    if geneNameLine:
                        if protName not in line:
                            sys.stdout.write("Warning, %s not found\n" % protName)
                        geneNameLine = False
                    if "||" not in line:  # not table line
                        continue
                    if "Gene Name" in line:
                        geneNameLine = True
                    elif "Synonyms" in line:
                        for synonym in _splitSynonyms(line):
                            outFile.write("," + synonym)
            outFile.write("\n")
if __name__=="__main__":
    # Command-line entry point: download the export of every page listed
    # in the names file, then collect the synonyms into one CSV file.
    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(description="Extract information from SubtiWiki for the BioNLP'11 REN task")
    optparser.add_option("-i", "--input", default="http://subtiwiki.uni-goettingen.de/wiki/api.php", dest="input", help="URL of the wiki API (api.php) to query")
    optparser.add_option("-o", "--output", default="subtiwiki/pages", dest="output", help="directory for the downloaded page XML files")
    optparser.add_option("-n", "--names", default="subtiwiki/Subtiwiki-Protein-Coding-Genes-Names.txt", dest="names", help="text file with one page title per line")
    optparser.add_option("-s", "--synonyms", default="subtiwiki/Subtiwiki-Synonyms.csv", dest="synonyms", help="output file for the extracted synonyms")
    optparser.add_option("-w", "--wait", default=5, type="float", dest="wait", help="seconds to sleep after each page download")
    (options, args) = optparser.parse_args()

    # loadPages writes into the output directory without creating it.
    if not os.path.isdir(options.output):
        os.makedirs(options.output)

    # create a Wiki object
    site = wiki.Wiki(options.input)
    loadPages(site, readProteinNames(options.names), options.output, wait=options.wait)
    getSynonyms(options.output, options.synonyms)