Package TEES :: Package Utils :: Package Libraries :: Module PorterStemmer
[hide private]

Source Code for Module TEES.Utils.Libraries.PorterStemmer

  1  #!/usr/bin/env python 
  2   
  3  """Porter Stemming Algorithm 
  4  This is the Porter stemming algorithm, ported to Python from the 
  5  version coded up in ANSI C by the author. It may be be regarded 
  6  as canonical, in that it follows the algorithm presented in 
  7   
  8  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 
  9  no. 3, pp 130-137, 
 10   
 11  only differing from it at the points maked --DEPARTURE-- below. 
 12   
 13  See also http://www.tartarus.org/~martin/PorterStemmer 
 14   
 15  The algorithm as described in the paper could be exactly replicated 
 16  by adjusting the points of DEPARTURE, but this is barely necessary, 
 17  because (a) the points of DEPARTURE are definitely improvements, and 
 18  (b) no encoding of the Porter stemmer I have seen is anything like 
 19  as exact as this version, even with the points of DEPARTURE! 
 20   
 21  Vivake Gupta (v@nano.com) 
 22   
 23  Modified by Jari Bjorne (added module-level stem-function) 
 24   
 25  Release 1: January 2001 
 26  """ 
 27   
 28  import sys 
 29   
 30  # A unique global instance of the PorterStemmer class, so that 
 31  # a new object does not need to be created every time something 
 32  # needs to be stemmed. 
 33  porterStemmerSingleton = None 
 34   
35 -class PorterStemmer:
36
37 - def __init__(self):
38 """The main part of the stemming algorithm starts here. 39 b is a buffer holding a word to be stemmed. The letters are in b[k0], 40 b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is 41 readjusted downwards as the stemming progresses. Zero termination is 42 not in fact used in the algorithm. 43 44 Note that only lower case sequences are stemmed. Forcing to lower case 45 should be done before stem(...) is called. 46 """ 47 48 self.b = "" # buffer for word to be stemmed 49 self.k = 0 50 self.k0 = 0 51 self.j = 0 # j is a general offset into the string
52
53 - def cons(self, i):
54 """cons(i) is TRUE <=> b[i] is a consonant.""" 55 if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': 56 return 0 57 if self.b[i] == 'y': 58 if i == self.k0: 59 return 1 60 else: 61 return (not self.cons(i - 1)) 62 return 1
63
64 - def m(self):
65 """m() measures the number of consonant sequences between k0 and j. 66 if c is a consonant sequence and v a vowel sequence, and <..> 67 indicates arbitrary presence, 68 69 <c><v> gives 0 70 <c>vc<v> gives 1 71 <c>vcvc<v> gives 2 72 <c>vcvcvc<v> gives 3 73 .... 74 """ 75 n = 0 76 i = self.k0 77 while 1: 78 if i > self.j: 79 return n 80 if not self.cons(i): 81 break 82 i = i + 1 83 i = i + 1 84 while 1: 85 while 1: 86 if i > self.j: 87 return n 88 if self.cons(i): 89 break 90 i = i + 1 91 i = i + 1 92 n = n + 1 93 while 1: 94 if i > self.j: 95 return n 96 if not self.cons(i): 97 break 98 i = i + 1 99 i = i + 1
100
101 - def vowelinstem(self):
102 """vowelinstem() is TRUE <=> k0,...j contains a vowel""" 103 for i in range(self.k0, self.j + 1): 104 if not self.cons(i): 105 return 1 106 return 0
107
108 - def doublec(self, j):
109 """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" 110 if j < (self.k0 + 1): 111 return 0 112 if (self.b[j] != self.b[j-1]): 113 return 0 114 return self.cons(j)
115
116 - def cvc(self, i):
117 """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant 118 and also if the second c is not w,x or y. this is used when trying to 119 restore an e at the end of a short e.g. 120 121 cav(e), lov(e), hop(e), crim(e), but 122 snow, box, tray. 123 """ 124 if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2): 125 return 0 126 ch = self.b[i] 127 if ch == 'w' or ch == 'x' or ch == 'y': 128 return 0 129 return 1
130
131 - def ends(self, s):
132 """ends(s) is TRUE <=> k0,...k ends with the string s.""" 133 length = len(s) 134 if s[length - 1] != self.b[self.k]: # tiny speed-up 135 return 0 136 if length > (self.k - self.k0 + 1): 137 return 0 138 if self.b[self.k-length+1:self.k+1] != s: 139 return 0 140 self.j = self.k - length 141 return 1
142
143 - def setto(self, s):
144 """setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" 145 length = len(s) 146 self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:] 147 self.k = self.j + length
148
149 - def r(self, s):
150 """r(s) is used further down.""" 151 if self.m() > 0: 152 self.setto(s)
153
154 - def step1ab(self):
155 """step1ab() gets rid of plurals and -ed or -ing. e.g. 156 157 caresses -> caress 158 ponies -> poni 159 ties -> ti 160 caress -> caress 161 cats -> cat 162 163 feed -> feed 164 agreed -> agree 165 disabled -> disable 166 167 matting -> mat 168 mating -> mate 169 meeting -> meet 170 milling -> mill 171 messing -> mess 172 173 meetings -> meet 174 """ 175 if self.b[self.k] == 's': 176 if self.ends("sses"): 177 self.k = self.k - 2 178 elif self.ends("ies"): 179 self.setto("i") 180 elif self.b[self.k - 1] != 's': 181 self.k = self.k - 1 182 if self.ends("eed"): 183 if self.m() > 0: 184 self.k = self.k - 1 185 elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): 186 self.k = self.j 187 if self.ends("at"): self.setto("ate") 188 elif self.ends("bl"): self.setto("ble") 189 elif self.ends("iz"): self.setto("ize") 190 elif self.doublec(self.k): 191 self.k = self.k - 1 192 ch = self.b[self.k] 193 if ch == 'l' or ch == 's' or ch == 'z': 194 self.k = self.k + 1 195 elif (self.m() == 1 and self.cvc(self.k)): 196 self.setto("e")
197
198 - def step1c(self):
199 """step1c() turns terminal y to i when there is another vowel in the stem.""" 200 if (self.ends("y") and self.vowelinstem()): 201 self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
202
203 - def step2(self):
204 """step2() maps double suffices to single ones. 205 so -ization ( = -ize plus -ation) maps to -ize etc. note that the 206 string before the suffix must give m() > 0. 207 """ 208 if self.b[self.k - 1] == 'a': 209 if self.ends("ational"): self.r("ate") 210 elif self.ends("tional"): self.r("tion") 211 elif self.b[self.k - 1] == 'c': 212 if self.ends("enci"): self.r("ence") 213 elif self.ends("anci"): self.r("ance") 214 elif self.b[self.k - 1] == 'e': 215 if self.ends("izer"): self.r("ize") 216 elif self.b[self.k - 1] == 'l': 217 if self.ends("bli"): self.r("ble") # --DEPARTURE-- 218 # To match the published algorithm, replace this phrase with 219 # if self.ends("abli"): self.r("able") 220 elif self.ends("alli"): self.r("al") 221 elif self.ends("entli"): self.r("ent") 222 elif self.ends("eli"): self.r("e") 223 elif self.ends("ousli"): self.r("ous") 224 elif self.b[self.k - 1] == 'o': 225 if self.ends("ization"): self.r("ize") 226 elif self.ends("ation"): self.r("ate") 227 elif self.ends("ator"): self.r("ate") 228 elif self.b[self.k - 1] == 's': 229 if self.ends("alism"): self.r("al") 230 elif self.ends("iveness"): self.r("ive") 231 elif self.ends("fulness"): self.r("ful") 232 elif self.ends("ousness"): self.r("ous") 233 elif self.b[self.k - 1] == 't': 234 if self.ends("aliti"): self.r("al") 235 elif self.ends("iviti"): self.r("ive") 236 elif self.ends("biliti"): self.r("ble") 237 elif self.b[self.k - 1] == 'g': # --DEPARTURE-- 238 if self.ends("logi"): self.r("log")
239 # To match the published algorithm, delete this phrase 240
241 - def step3(self):
242 """step3() dels with -ic-, -full, -ness etc. similar strategy to step2.""" 243 if self.b[self.k] == 'e': 244 if self.ends("icate"): self.r("ic") 245 elif self.ends("ative"): self.r("") 246 elif self.ends("alize"): self.r("al") 247 elif self.b[self.k] == 'i': 248 if self.ends("iciti"): self.r("ic") 249 elif self.b[self.k] == 'l': 250 if self.ends("ical"): self.r("ic") 251 elif self.ends("ful"): self.r("") 252 elif self.b[self.k] == 's': 253 if self.ends("ness"): self.r("")
254
255 - def step4(self):
256 """step4() takes off -ant, -ence etc., in context <c>vcvc<v>.""" 257 if self.b[self.k - 1] == 'a': 258 if self.ends("al"): pass 259 else: return 260 elif self.b[self.k - 1] == 'c': 261 if self.ends("ance"): pass 262 elif self.ends("ence"): pass 263 else: return 264 elif self.b[self.k - 1] == 'e': 265 if self.ends("er"): pass 266 else: return 267 elif self.b[self.k - 1] == 'i': 268 if self.ends("ic"): pass 269 else: return 270 elif self.b[self.k - 1] == 'l': 271 if self.ends("able"): pass 272 elif self.ends("ible"): pass 273 else: return 274 elif self.b[self.k - 1] == 'n': 275 if self.ends("ant"): pass 276 elif self.ends("ement"): pass 277 elif self.ends("ment"): pass 278 elif self.ends("ent"): pass 279 else: return 280 elif self.b[self.k - 1] == 'o': 281 if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass 282 elif self.ends("ou"): pass 283 # takes care of -ous 284 else: return 285 elif self.b[self.k - 1] == 's': 286 if self.ends("ism"): pass 287 else: return 288 elif self.b[self.k - 1] == 't': 289 if self.ends("ate"): pass 290 elif self.ends("iti"): pass 291 else: return 292 elif self.b[self.k - 1] == 'u': 293 if self.ends("ous"): pass 294 else: return 295 elif self.b[self.k - 1] == 'v': 296 if self.ends("ive"): pass 297 else: return 298 elif self.b[self.k - 1] == 'z': 299 if self.ends("ize"): pass 300 else: return 301 else: 302 return 303 if self.m() > 1: 304 self.k = self.j
305
306 - def step5(self):
307 """step5() removes a final -e if m() > 1, and changes -ll to -l if 308 m() > 1. 309 """ 310 self.j = self.k 311 if self.b[self.k] == 'e': 312 a = self.m() 313 if a > 1 or (a == 1 and not self.cvc(self.k-1)): 314 self.k = self.k - 1 315 if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: 316 self.k = self.k -1
317
318 - def stem(self, p, i, j):
319 """In stem(p,i,j), p is a char pointer, and the string to be stemmed 320 is from p[i] to p[j] inclusive. Typically i is zero and j is the 321 offset to the last character of a string, (p[j+1] == '\0'). The 322 stemmer adjusts the characters p[i] ... p[j] and returns the new 323 end-point of the string, k. Stemming never increases word length, so 324 i <= k <= j. To turn the stemmer into a module, declare 'stem' as 325 extern, and delete the remainder of this file. 326 """ 327 # copy the parameters into statics 328 self.b = p 329 self.k = j 330 self.k0 = i 331 if self.k <= self.k0 + 1: 332 return self.b # --DEPARTURE-- 333 334 # With this line, strings of length 1 or 2 don't go through the 335 # stemming process, although no mention is made of this in the 336 # published algorithm. Remove the line to match the published 337 # algorithm. 338 339 self.step1ab() 340 self.step1c() 341 self.step2() 342 self.step3() 343 self.step4() 344 self.step5() 345 return self.b[self.k0:self.k+1]
346
347 -def stem(word, startPos=None, endPos=None):
348 """ Get the stem of a word. 349 350 Returns the stem of the word. Can optionally stem only a substring 351 of the word argument, if startPos and endPos are defined. 352 353 Keyword arguments: 354 word (string): the word to be stemmed 355 startPos (int): start index of the section to stem in word 356 endPos (int): end index of the section to stem in word 357 Return value (string): 358 The stem 359 """ 360 global porterStemmerSingleton 361 if porterStemmerSingleton == None: 362 porterStemmerSingleton = PorterStemmer() 363 364 if startPos == None: 365 startPos = 0 366 if endPos == None: 367 endPos = len(word) - 1 368 return porterStemmerSingleton.stem(word, startPos, endPos)
369 370 if __name__ == '__main__': 371 p = PorterStemmer() 372 if len(sys.argv) > 1: 373 for f in sys.argv[1:]: 374 infile = open(f, 'r') 375 while 1: 376 w = infile.readline() 377 if w == '': 378 break 379 w = w[:-1] 380 print p.stem(w, 0,len(w)-1) 381