def today():
    """Return the current local date as an ISO 8601 string.

    The value is substituted into the daily page URL by the caller, so it
    must keep the zero-padded ``YYYY-MM-DD`` form.

    Returns:
        str: Today's date according to the local clock, e.g. ``'2015-04-27'``.
    """
    # date.today() yields the same local date as datetime.today().date()
    # without building a throwaway datetime, and isoformat() spells out the
    # format that str(date) only provides implicitly.
    return datetime.date.today().isoformat()