ウェブ上のNYT（New York Times）の記事を読み込んで、見出し語（レンマ）を抽出してみる。

from __future__ import division
import nltk, re, pprint
from nltk.book import *
from urllib import urlopen
# Fetch the NYT "Japan" topic page and build a sorted, de-duplicated
# vocabulary of lemmatized words from its text.
url = "http://topics.nytimes.com/top/news/international/countriesandterritories/japan/index.html"
wnl = nltk.WordNetLemmatizer()

# Close the connection explicitly rather than leaving it to garbage
# collection; the object returned by Python 2's urllib.urlopen is not a
# context manager, hence try/finally instead of `with`.
response = urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# NOTE(review): nltk.clean_html exists only in NLTK 2.x - confirm the
# installed NLTK version before running.
raw = nltk.clean_html(html)
tokens = nltk.word_tokenize(raw)

# Keep purely alphabetic tokens, lower-cased, unique and sorted.
voc = sorted(set(w.lower() for w in tokens if w.isalpha()))

# Different word forms can lemmatize to the same entry, so de-duplicate
# again after lemmatizing.
vcl = sorted(set(wnl.lemmatize(v) for v in voc))

結果は以下のとおり。

>>> vcl[:30]
['a', 'abandon', 'abduction', 'abe', 'about', 'above', 'abroad', 'abrupt', 'abruptly',
 'absorb', 'accident', 'according', 'account', 'accumulate', 'acknowledge',
 'acknowledgement', 'acknowledgment', 'acquisition', 'across', 'act', 'action',
 'active', 'actively', 'activist', 'activity', 'ad', 'adapt', 'add', 'added', 'adding']
>>> len(vcl)
2251
>>>