1
2
3 """Porter Stemming Algorithm
4 This is the Porter stemming algorithm, ported to Python from the
5 version coded up in ANSI C by the author. It may be be regarded
6 as canonical, in that it follows the algorithm presented in
7
8 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
9 no. 3, pp 130-137,
10
11 only differing from it at the points maked --DEPARTURE-- below.
12
13 See also http://www.tartarus.org/~martin/PorterStemmer
14
15 The algorithm as described in the paper could be exactly replicated
16 by adjusting the points of DEPARTURE, but this is barely necessary,
17 because (a) the points of DEPARTURE are definitely improvements, and
18 (b) no encoding of the Porter stemmer I have seen is anything like
19 as exact as this version, even with the points of DEPARTURE!
20
21 Vivake Gupta (v@nano.com)
22
23 Modified by Jari Bjorne (added module-level stem-function)
24
25 Release 1: January 2001
26 """
27
28 import sys
29
30
31
32
33 porterStemmerSingleton = None
34
36
38 """The main part of the stemming algorithm starts here.
39 b is a buffer holding a word to be stemmed. The letters are in b[k0],
40 b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
41 readjusted downwards as the stemming progresses. Zero termination is
42 not in fact used in the algorithm.
43
44 Note that only lower case sequences are stemmed. Forcing to lower case
45 should be done before stem(...) is called.
46 """
47
48 self.b = ""
49 self.k = 0
50 self.k0 = 0
51 self.j = 0
52
54 """cons(i) is TRUE <=> b[i] is a consonant."""
55 if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
56 return 0
57 if self.b[i] == 'y':
58 if i == self.k0:
59 return 1
60 else:
61 return (not self.cons(i - 1))
62 return 1
63
65 """m() measures the number of consonant sequences between k0 and j.
66 if c is a consonant sequence and v a vowel sequence, and <..>
67 indicates arbitrary presence,
68
69 <c><v> gives 0
70 <c>vc<v> gives 1
71 <c>vcvc<v> gives 2
72 <c>vcvcvc<v> gives 3
73 ....
74 """
75 n = 0
76 i = self.k0
77 while 1:
78 if i > self.j:
79 return n
80 if not self.cons(i):
81 break
82 i = i + 1
83 i = i + 1
84 while 1:
85 while 1:
86 if i > self.j:
87 return n
88 if self.cons(i):
89 break
90 i = i + 1
91 i = i + 1
92 n = n + 1
93 while 1:
94 if i > self.j:
95 return n
96 if not self.cons(i):
97 break
98 i = i + 1
99 i = i + 1
100
102 """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
103 for i in range(self.k0, self.j + 1):
104 if not self.cons(i):
105 return 1
106 return 0
107
109 """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
110 if j < (self.k0 + 1):
111 return 0
112 if (self.b[j] != self.b[j-1]):
113 return 0
114 return self.cons(j)
115
117 """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
118 and also if the second c is not w,x or y. this is used when trying to
119 restore an e at the end of a short e.g.
120
121 cav(e), lov(e), hop(e), crim(e), but
122 snow, box, tray.
123 """
124 if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
125 return 0
126 ch = self.b[i]
127 if ch == 'w' or ch == 'x' or ch == 'y':
128 return 0
129 return 1
130
132 """ends(s) is TRUE <=> k0,...k ends with the string s."""
133 length = len(s)
134 if s[length - 1] != self.b[self.k]:
135 return 0
136 if length > (self.k - self.k0 + 1):
137 return 0
138 if self.b[self.k-length+1:self.k+1] != s:
139 return 0
140 self.j = self.k - length
141 return 1
142
144 """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
145 length = len(s)
146 self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
147 self.k = self.j + length
148
150 """r(s) is used further down."""
151 if self.m() > 0:
152 self.setto(s)
153
155 """step1ab() gets rid of plurals and -ed or -ing. e.g.
156
157 caresses -> caress
158 ponies -> poni
159 ties -> ti
160 caress -> caress
161 cats -> cat
162
163 feed -> feed
164 agreed -> agree
165 disabled -> disable
166
167 matting -> mat
168 mating -> mate
169 meeting -> meet
170 milling -> mill
171 messing -> mess
172
173 meetings -> meet
174 """
175 if self.b[self.k] == 's':
176 if self.ends("sses"):
177 self.k = self.k - 2
178 elif self.ends("ies"):
179 self.setto("i")
180 elif self.b[self.k - 1] != 's':
181 self.k = self.k - 1
182 if self.ends("eed"):
183 if self.m() > 0:
184 self.k = self.k - 1
185 elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
186 self.k = self.j
187 if self.ends("at"): self.setto("ate")
188 elif self.ends("bl"): self.setto("ble")
189 elif self.ends("iz"): self.setto("ize")
190 elif self.doublec(self.k):
191 self.k = self.k - 1
192 ch = self.b[self.k]
193 if ch == 'l' or ch == 's' or ch == 'z':
194 self.k = self.k + 1
195 elif (self.m() == 1 and self.cvc(self.k)):
196 self.setto("e")
197
199 """step1c() turns terminal y to i when there is another vowel in the stem."""
200 if (self.ends("y") and self.vowelinstem()):
201 self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
202
204 """step2() maps double suffices to single ones.
205 so -ization ( = -ize plus -ation) maps to -ize etc. note that the
206 string before the suffix must give m() > 0.
207 """
208 if self.b[self.k - 1] == 'a':
209 if self.ends("ational"): self.r("ate")
210 elif self.ends("tional"): self.r("tion")
211 elif self.b[self.k - 1] == 'c':
212 if self.ends("enci"): self.r("ence")
213 elif self.ends("anci"): self.r("ance")
214 elif self.b[self.k - 1] == 'e':
215 if self.ends("izer"): self.r("ize")
216 elif self.b[self.k - 1] == 'l':
217 if self.ends("bli"): self.r("ble")
218
219
220 elif self.ends("alli"): self.r("al")
221 elif self.ends("entli"): self.r("ent")
222 elif self.ends("eli"): self.r("e")
223 elif self.ends("ousli"): self.r("ous")
224 elif self.b[self.k - 1] == 'o':
225 if self.ends("ization"): self.r("ize")
226 elif self.ends("ation"): self.r("ate")
227 elif self.ends("ator"): self.r("ate")
228 elif self.b[self.k - 1] == 's':
229 if self.ends("alism"): self.r("al")
230 elif self.ends("iveness"): self.r("ive")
231 elif self.ends("fulness"): self.r("ful")
232 elif self.ends("ousness"): self.r("ous")
233 elif self.b[self.k - 1] == 't':
234 if self.ends("aliti"): self.r("al")
235 elif self.ends("iviti"): self.r("ive")
236 elif self.ends("biliti"): self.r("ble")
237 elif self.b[self.k - 1] == 'g':
238 if self.ends("logi"): self.r("log")
239
240
242 """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
243 if self.b[self.k] == 'e':
244 if self.ends("icate"): self.r("ic")
245 elif self.ends("ative"): self.r("")
246 elif self.ends("alize"): self.r("al")
247 elif self.b[self.k] == 'i':
248 if self.ends("iciti"): self.r("ic")
249 elif self.b[self.k] == 'l':
250 if self.ends("ical"): self.r("ic")
251 elif self.ends("ful"): self.r("")
252 elif self.b[self.k] == 's':
253 if self.ends("ness"): self.r("")
254
256 """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
257 if self.b[self.k - 1] == 'a':
258 if self.ends("al"): pass
259 else: return
260 elif self.b[self.k - 1] == 'c':
261 if self.ends("ance"): pass
262 elif self.ends("ence"): pass
263 else: return
264 elif self.b[self.k - 1] == 'e':
265 if self.ends("er"): pass
266 else: return
267 elif self.b[self.k - 1] == 'i':
268 if self.ends("ic"): pass
269 else: return
270 elif self.b[self.k - 1] == 'l':
271 if self.ends("able"): pass
272 elif self.ends("ible"): pass
273 else: return
274 elif self.b[self.k - 1] == 'n':
275 if self.ends("ant"): pass
276 elif self.ends("ement"): pass
277 elif self.ends("ment"): pass
278 elif self.ends("ent"): pass
279 else: return
280 elif self.b[self.k - 1] == 'o':
281 if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
282 elif self.ends("ou"): pass
283
284 else: return
285 elif self.b[self.k - 1] == 's':
286 if self.ends("ism"): pass
287 else: return
288 elif self.b[self.k - 1] == 't':
289 if self.ends("ate"): pass
290 elif self.ends("iti"): pass
291 else: return
292 elif self.b[self.k - 1] == 'u':
293 if self.ends("ous"): pass
294 else: return
295 elif self.b[self.k - 1] == 'v':
296 if self.ends("ive"): pass
297 else: return
298 elif self.b[self.k - 1] == 'z':
299 if self.ends("ize"): pass
300 else: return
301 else:
302 return
303 if self.m() > 1:
304 self.k = self.j
305
307 """step5() removes a final -e if m() > 1, and changes -ll to -l if
308 m() > 1.
309 """
310 self.j = self.k
311 if self.b[self.k] == 'e':
312 a = self.m()
313 if a > 1 or (a == 1 and not self.cvc(self.k-1)):
314 self.k = self.k - 1
315 if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
316 self.k = self.k -1
317
318 - def stem(self, p, i, j):
319 """In stem(p,i,j), p is a char pointer, and the string to be stemmed
320 is from p[i] to p[j] inclusive. Typically i is zero and j is the
321 offset to the last character of a string, (p[j+1] == '\0'). The
322 stemmer adjusts the characters p[i] ... p[j] and returns the new
323 end-point of the string, k. Stemming never increases word length, so
324 i <= k <= j. To turn the stemmer into a module, declare 'stem' as
325 extern, and delete the remainder of this file.
326 """
327
328 self.b = p
329 self.k = j
330 self.k0 = i
331 if self.k <= self.k0 + 1:
332 return self.b
333
334
335
336
337
338
339 self.step1ab()
340 self.step1c()
341 self.step2()
342 self.step3()
343 self.step4()
344 self.step5()
345 return self.b[self.k0:self.k+1]
346
347 -def stem(word, startPos=None, endPos=None):
348 """ Get the stem of a word.
349
350 Returns the stem of the word. Can optionally stem only a substring
351 of the word argument, if startPos and endPos are defined.
352
353 Keyword arguments:
354 word (string): the word to be stemmed
355 startPos (int): start index of the section to stem in word
356 endPos (int): end index of the section to stem in word
357 Return value (string):
358 The stem
359 """
360 global porterStemmerSingleton
361 if porterStemmerSingleton == None:
362 porterStemmerSingleton = PorterStemmer()
363
364 if startPos == None:
365 startPos = 0
366 if endPos == None:
367 endPos = len(word) - 1
368 return porterStemmerSingleton.stem(word, startPos, endPos)
369
370 if __name__ == '__main__':
371 p = PorterStemmer()
372 if len(sys.argv) > 1:
373 for f in sys.argv[1:]:
374 infile = open(f, 'r')
375 while 1:
376 w = infile.readline()
377 if w == '':
378 break
379 w = w[:-1]
380 print p.stem(w, 0,len(w)-1)
381