1 import sys, os
2 import codecs, time
3 from wikitools import wiki
4 from wikitools import api
5
def loadPages(site, pages, outDir, wait=3):
    """Download the XML export of each wiki page into its own file.

    For every title in ``pages`` an ``action=query`` request carrying the
    ``export`` flag is issued through ``api.APIRequest``.  The exported XML
    is written, UTF-8 encoded, to ``<outDir>/<title>.xml``.  Progress
    messages are written to stderr.

    Args:
        site: Wiki handle, passed unchanged to ``api.APIRequest``.
        pages: Sequence of page titles; ``len()`` is taken for the
            progress counter.
        outDir: Directory receiving one ``.xml`` file per page.  It is not
            created here.
        wait: Seconds to sleep after each page has been saved.
    """
    total = len(pages)
    for index, page in enumerate(pages, 1):
        sys.stderr.write("Loading page %s (%d/%d)\n" % (page, index, total))

        params = {'action': 'query', 'titles': page, 'export': None}
        result = api.APIRequest(site, params).query()

        sys.stderr.write("Writing result\n")
        outPath = os.path.join(outDir, page + ".xml")
        # Mode is "w", not "wt": codecs.open() appends "b" on its own and
        # Python 3 refuses a mode that combines "t" with "b".  The "with"
        # block closes the file even when the write fails.
        with codecs.open(outPath, "w", "utf-8") as f:
            f.write(result["query"]["export"]["*"])

        sys.stderr.write("Sleeping\n")
        time.sleep(wait)
25
def readProteinNames(file):
    """Read protein names from a UTF-8 text file holding one name per line.

    Args:
        file: Path of the name list.

    Returns:
        List of names in file order.  Surrounding whitespace is stripped
        and lines that are empty after stripping are skipped, so a
        trailing blank line does not turn into an empty page title.
    """
    names = []
    # Mode is "r", not "rt": codecs.open() appends "b" on its own and
    # Python 3 refuses a mode that combines "t" with "b".
    with codecs.open(file, "r", "utf-8") as f:
        for line in f:
            name = line.strip()
            if name:
                names.append(name)
    return names
32
def getSynonyms(inDir, outFilename):
    """Collect synonyms from saved wiki pages into a comma-separated file.

    Every entry of ``inDir`` whose file name contains ``xml`` is read as
    UTF-8, in sorted file-name order.  One output line is written per
    file: the part of the file name before its first ``.``, followed by
    each non-empty synonym.  Synonyms are taken from lines that contain
    both ``||`` and ``Synonyms`` (and not ``Gene Name``): the second
    ``||``-separated field is used, with ``''`` markers removed, split on
    commas.

    The line that follows a ``||`` line containing ``Gene Name`` is
    checked for the name; if it is absent a warning is written to stdout.

    Args:
        inDir: Directory holding the saved page files.
        outFilename: Path of the output file, written as UTF-8.
    """
    # The output is UTF-8 like the pages it is derived from, so non-ASCII
    # synonyms can be written.  Both files are closed by "with" even if
    # parsing fails part-way.
    with codecs.open(outFilename, "w", "utf-8") as outFile:
        for fileName in sorted(os.listdir(inDir)):
            if "xml" not in fileName:
                continue
            protName = fileName.split(".")[0]
            row = [protName]
            pagePath = os.path.join(inDir, fileName)
            with codecs.open(pagePath, "r", "utf-8") as f:
                geneNameLine = False
                for line in f:
                    if geneNameLine:
                        # Previous line was the "Gene Name" row; this one
                        # is expected to mention the name.
                        if protName not in line:
                            sys.stdout.write("Warning, %s not found\n" % protName)
                        geneNameLine = False
                    if "||" not in line:
                        continue
                    if "Gene Name" in line:
                        geneNameLine = True
                    elif "Synonyms" in line:
                        synString = line.split("||")[1].replace("''", "")
                        for synonym in synString.split(","):
                            synonym = synonym.strip()
                            if synonym:
                                row.append(synonym)
            outFile.write(",".join(row) + "\n")
60
if __name__ == "__main__":
    # Command-line entry point: download the listed pages, then extract
    # their synonyms into a single file.
    import sys
    from optparse import OptionParser

    # Psyco is optional: use it when it can be imported, otherwise carry on.
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(description="Extract information from SubtiWiki for the BioNLP'11 REN task")
    optparser.add_option("-i", "--input", default="http://subtiwiki.uni-goettingen.de/wiki/api.php", dest="input", help="URL of the wiki API endpoint to download pages from")
    optparser.add_option("-o", "--output", default="subtiwiki/pages", dest="output", help="Directory where the downloaded page XML files are stored")
    optparser.add_option("-n", "--names", default="subtiwiki/Subtiwiki-Protein-Coding-Genes-Names.txt", dest="names", help="UTF-8 text file listing the page names to download, one per line")
    optparser.add_option("-s", "--synonyms", default="subtiwiki/Subtiwiki-Synonyms.csv", dest="synonyms", help="Output file for the extracted synonyms")
    # The pause between downloads used to be fixed at 5 seconds; the
    # default keeps that behaviour.
    optparser.add_option("-w", "--wait", default=5, type="float", dest="wait", help="Seconds to pause after each downloaded page")
    (options, args) = optparser.parse_args()

    site = wiki.Wiki(options.input)
    loadPages(site, readProteinNames(options.names), options.output, wait=options.wait)
    getSynonyms(options.output, options.synonyms)
83