# -*- coding: utf-8 -*-
"""Render one year of blog posts as a word cloud of Japanese nouns.

The script downloads the twelve monthly archive pages found at
``<blog_url>/<year>/<MM>``, extracts the nouns of every post with MeCab and
draws them with ``wordcloud``/``matplotlib``.

All text handling assumes Python 3 (``str`` is Unicode, files are opened
with an explicit UTF-8 encoding).
"""
import os
import re
import time

import MeCab
import matplotlib.pyplot as plt
import urllib3
from bs4 import BeautifulSoup
from wordcloud import WordCloud


class YearWordVisualizer():
    """Crawl, preprocess and visualize the posts of a single year.

    The three stages communicate through the filesystem, relative to the
    current working directory:

    1. ``crawl_articles`` saves each monthly archive page in ``html/``.
    2. ``preprocess`` writes one space-separated noun list per post in
       ``text/``.
    3. ``visualize`` joins every file in ``text/`` into one word cloud.
    """

    # Directory holding the downloaded monthly archive pages.
    HTML_DIR = 'html'
    # Directory holding one noun list per post.
    TEXT_DIR = 'text'
    # Font used when the caller does not pass ``font_path``.
    DEFAULT_FONT_PATH = '/System/Library/Fonts/ヒラギノ明朝 ProN W3.otf'

    # Nouns that are dropped from every post.
    STOP_WORDS = frozenset([
        'こと', 'これ', 'それ', 'もの', 'の',
        'よう', 'とき', '以下', '場合', '結果',
    ])

    # Tokens made only of punctuation/bracket characters.
    # "-" and "]" are escaped so they are literals instead of forming a range
    # or closing the class early; "," and "." are listed explicitly because
    # the original "+-/" range happened to include them.
    # NOTE(review): the original class repeated the ASCII brackets "()[]";
    # presumably those were full-width brackets normalised in transit, so
    # both forms are accepted here -- confirm.
    _SYMBOLS_ONLY = re.compile(r'^[+,\-./%\^\[\](){}「」（）［］。、]+$')

    # A lone ASCII letter such as "x" or "n".
    _SINGLE_ASCII_LETTER = re.compile(r'^[A-Za-z]$')

    def __init__(self, blog_url, year, font_path=None):
        """Store the crawl target.

        Args:
            blog_url: Base URL of the blog, without a trailing slash.
            year: Year whose twelve monthly archives are processed.
            font_path: Font file handed to ``WordCloud``. Defaults to
                ``DEFAULT_FONT_PATH``.
        """
        self.blog_url = blog_url
        self.year = year
        self.font_path = self.DEFAULT_FONT_PATH if font_path is None else font_path
        # Created on first use by get_noun_list and then reused.
        self._tagger = None

    def run(self):
        """Run the whole pipeline: crawl, preprocess, then visualize."""
        self.crawl_articles()
        self.preprocess()
        self.visualize()

    def crawl_articles(self):
        """Download the archive page of every month into ``HTML_DIR``.

        Files are named ``01.html`` .. ``12.html`` and contain the raw
        response body.
        """
        os.makedirs(self.HTML_DIR, exist_ok=True)
        http = urllib3.PoolManager()
        for month in range(1, 13):
            # Wait one second before each request so requests are spaced out.
            time.sleep(1)
            url = '{blog}/{year}/{month:02d}'.format(
                blog=self.blog_url, year=self.year, month=month)
            response = http.request('GET', url)
            path = os.path.join(
                self.HTML_DIR, '{month:02d}.html'.format(month=month))
            # The body is bytes, so the file is written in binary mode.
            with open(path, 'wb') as f:
                f.write(response.data)

    def preprocess(self):
        """Turn every post in ``HTML_DIR`` into a noun list in ``TEXT_DIR``.

        ``<pre>`` and ``<img>`` elements are removed before the text of each
        ``div.post-body`` is extracted. Output files are numbered
        ``1.txt``, ``2.txt``, ... across all pages.
        """
        os.makedirs(self.TEXT_DIR, exist_ok=True)
        cnt = 0
        # Sorted so that the numbering of the output files is deterministic.
        for f_name in sorted(os.listdir(self.HTML_DIR)):
            # Read raw bytes and let BeautifulSoup work out the encoding.
            with open(os.path.join(self.HTML_DIR, f_name), 'rb') as f:
                html = f.read()
            soup = BeautifulSoup(html, 'lxml')
            for tag in soup.find_all(['pre', 'img']):
                tag.decompose()
            for post in soup.find_all('div', class_='post-body'):
                nouns = self.get_noun_list(post.getText())
                doc = " ".join(self.remove_stop_words(nouns))
                cnt += 1
                output_file = "{dir}/{cnt}.txt".format(
                    dir=self.TEXT_DIR, cnt=cnt)
                with open(output_file, 'w', encoding='utf-8') as out:
                    out.write(doc)

    def get_noun_list(self, text):
        """Return the surface forms of all nouns MeCab finds in ``text``.

        Args:
            text: Text to analyse. UTF-8 encoded ``bytes`` are accepted for
                backward compatibility and decoded first.

        Returns:
            A list of noun surface strings in order of appearance.
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        if self._tagger is None:
            self._tagger = MeCab.Tagger('-Ochasen')
        node = self._tagger.parseToNode(text)
        res = []
        while node:
            # The first comma-separated feature field is the part of speech.
            if node.feature.split(',')[0] == '名詞':
                res.append(node.surface)
            node = node.next
        return res

    def remove_stop_words(self, words):
        """Filter out tokens that carry no meaning for the word cloud.

        A word is dropped when it is purely numeric, a single ASCII letter,
        made only of the symbols in ``_SYMBOLS_ONLY``, or listed in
        ``STOP_WORDS``.

        Args:
            words: Iterable of ``str`` tokens.

        Returns:
            A new list with the remaining words, order preserved.
        """
        return [
            w for w in words
            if not w.isdigit()
            and not self._SINGLE_ASCII_LETTER.match(w)
            and not self._SYMBOLS_ONLY.match(w)
            and w not in self.STOP_WORDS
        ]

    def visualize(self):
        """Show a word cloud built from every file in ``TEXT_DIR``."""
        docs = []
        for f_name in sorted(os.listdir(self.TEXT_DIR)):
            with open(os.path.join(self.TEXT_DIR, f_name), 'r',
                      encoding='utf-8') as f:
                docs.append(f.read())
        text = ' '.join(docs)
        wordcloud = WordCloud(
            font_path=self.font_path, width=900, height=500).generate(text)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.show()


if __name__ == '__main__':
    visualizer = YearWordVisualizer('http://techtipshoge.blogspot.jp', 2015)
    visualizer.run()
# 0 comments:  (appears to be a blog-page footer captured with the script, not part of the program)