全宋词爬取过程及数据分析
Posted March 07, 2017
由于某个公众号对我仓库chinese-poetry的推广, 短时间大量涨粉, 有人想要宋词的数据。于是最近利用零散时间对全宋词进行了爬取, 并做了简单的分析, 发现了一些不得了的事情。分析仅包括对全宋词内容的关键字排名分析、宋词作者产量分析、最受欢迎的词牌名排名分析。
关键字排名分析
宋人喜欢用“东风”, 这个词放到现代来看也很微妙; “人间”、“何处”从唐诗开始就蝉联前三。即使到了现代, 这两个词依然留存古风。
宋词作者产量分析
辛弃疾果不其然地成为两宋现存词最多的作家, 还有一些作者虽然产量丰富, 但未必为我们所熟知。
最受欢迎的词牌名排名分析
浣溪沙作为婉约、豪放两派都常用的词牌, 在两宋时期最受欢迎也是理所应当。
爬取逻辑没有做相应的系统化处理, 只是简单的脚本, 配合交互式界面做的操作。采用的相关技术: Python + parsel + peewee + requests + jieba
附上爬取解析脚本的逻辑:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
''' | |
File Name: parser.py | |
Author: JackeyGao | |
mail: gaojunqi@outlook.com | |
''' | |
import sys | |
import random | |
import time | |
import requests | |
import re | |
from parsel import Selector | |
from peewee import IntegrityError | |
from db import Ci | |
from db import CiAuthor | |
# The two longest header values are named separately so the table below
# stays readable.
_BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# NOTE(review): hard-coded cookie string containing session identifiers --
# presumably copied from a browser session and likely expired; confirm
# whether the site still needs it.
_SESSION_COOKIE = "Hm_lvt_12506b8a4147836b0046047de09b2a2e=1493688567; _D_SID=92CED13DD066A18AEC64F1086BA2B715; ASPSESSIONIDSABSRATC=OOFAEFEAJAGIAIEMGGAEDBNL; UM_distinctid=15c6821bb13453-0fd27be8dc79a5-30657509-13c680-15c6821bb14468; CNZZDATA618132=cnzz_eid%3D761011847-1496395659-null%26ntime%3D1496395659"

# HTTP headers passed as ``headers=header`` on the POST requests this
# script makes; they imitate a desktop Chrome form submission.
header = {
    "Connection": "keep-alive",
    "Origin": "http://qsc.zww.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": _BROWSER_USER_AGENT,
    "Content-Type": "application/x-www-form-urlencoded",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "http://qsc.zww.cn/",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
    "Cookie": _SESSION_COOKIE,
}
# Python 2 only: make UTF-8 the implicit str <-> unicode codec so that mixing
# byte strings and unicode text elsewhere in this script does not raise.
# Guarded because neither ``reload`` (as a builtin) nor
# ``sys.setdefaultencoding`` exists on Python 3, where text is already unicode.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Lazily captures the text between a "(" and the next ")"; used to pull the
# comma-separated arguments out of an ``onclick="...(a,b,c)"`` attribute.
seek_patt = re.compile(r"\((.*?)\)", re.I|re.X)
# --------------------------------- | |
class QTSBase(object):
    """Python stand-in for the ``QTS`` object that the site's scripts call.

    Each ``fill*`` method only records its argument on the instance, so the
    value can be read back after a ``parent.QTS.fill...(...)`` line taken
    from a response has been executed.
    """

    def _keep(self, slot, payload):
        # Single place where a captured value is written onto the instance.
        setattr(self, slot, payload)

    def filllist(self, content):
        """Store ``content`` as ``self.content``."""
        self._keep('content', content)

    def fillpage(self, fillpage):
        """Store ``fillpage`` as ``self.page``."""
        self._keep('page', fillpage)

    def fillbody(self, content):
        """Store ``content`` as ``self.content`` (same slot as ``filllist``)."""
        self._keep('content', content)
class ParentBase(object):
    """Stand-in for the ``parent`` object referenced by the site's scripts."""

    def __init__(self):
        # The attribute must be spelled ``QTS`` so that executed
        # ``parent.QTS.<method>(...)`` lines resolve against it.
        self.QTS = QTSBase()
# ---------------------------------- | |
# Module-level shim instance; the callbacks execute ``parent.QTS.*`` lines
# taken from responses against this object and then read the stored value.
parent = ParentBase()
# Seed the pager text with a direct call -- running a constant string through
# exec() added nothing.
parent.QTS.fillpage('第1页 共92页 1564条')
def __with_seek_type__(seek_type, timeout=None):
    """Build a request function bound to one ``seektype`` code.

    :param seek_type: value sent as the ``seektype`` form field.
    :param timeout: optional timeout forwarded to ``requests.post``. The
        default ``None`` keeps the previous behaviour (wait indefinitely).
    :returns: ``request(pageno, value='')``, which POSTs the form to
        ``getdata.asp`` and returns the ``requests`` response object.
    """
    def request(pageno, value=''):
        """POST one query; ``pageno`` is coerced with ``int()``."""
        url = 'http://qsc.zww.cn/getdata.asp'
        payload = {
            'seektype': seek_type,
            'seekvalue': value,
            'pageno': int(pageno)
        }
        return requests.post(
            url,
            data=payload,
            headers=header,
            timeout=timeout
        )
    return request
def parse(html, callback, *args, **kwargs):
    """Repair the response encoding, build a ``Selector`` and run ``callback``.

    The text is re-encoded as latin-1 and decoded again as gb2312, dropping
    undecodable bytes. This presumably undoes a wrong latin-1 guess made when
    the response was first decoded -- TODO confirm against the live site.

    :param html: response text (the callers pass ``resp.text``); a byte
        string is first decoded as UTF-8.
    :param callback: called as ``callback(selector, *args, **kwargs)``.
    :returns: whatever ``callback`` returns.
    """
    # Only bytes need decoding. Calling ``.decode`` on text was a round-trip
    # no-op on Python 2 and an AttributeError on Python 3.
    if isinstance(html, bytes):
        html = html.decode('utf8')
    raw = html.encode('latin1')
    # NOTE(review): 'ignore' silently drops any byte pair outside GB2312.
    # If the site actually serves GBK/GB18030, rare characters are being
    # lost here -- verify before changing the codec.
    html = raw.decode('gb2312', 'ignore')
    sel = Selector(text=html)
    return callback(sel, *args, **kwargs)
def callback_author_list(sel, *args, **kwargs):
    """Insert the authors linked from one author-list response into CiAuthor.

    The first ``<script>`` element is scanned for lines starting with
    ``parent.QTS.filllist``. Each such line is executed against the
    module-level ``parent`` shim, and the markup it stored is parsed. Every
    ``<a>`` whose ``onclick`` arguments start with type ``'10'`` becomes a
    ``CiAuthor`` row; a primary key that already exists is reported and
    skipped.

    :param sel: ``Selector`` for the decoded response.
    :param args: unused.
    :param kwargs: unused.
    """
    scripts = sel.xpath('//script').extract()
    if not scripts:
        # No script payload at all (empty or error page): nothing to store.
        return
    for line in scripts[0].splitlines():
        if not line.startswith('parent.QTS.filllist'):
            continue
        # SECURITY: this executes text received from the remote server as
        # Python code. Kept as-is, but it should be replaced by parsing the
        # string argument out of the line instead of running it.
        exec(line)
        # The fragment is consumed right after the call that stored it, so a
        # response without a filllist line can never re-use content left
        # over from an earlier request.
        fragment = parent.QTS.content
        if isinstance(fragment, bytes):
            fragment = fragment.decode('utf-8')
        for link in Selector(text=fragment).xpath('//a'):
            seek = link.xpath('@onclick').extract()[0]
            seek = seek_patt.findall(seek)[0]
            # onclick arguments look like "<type>,<value>,<pageno>".
            _type, value, _ = seek.split(',')
            if _type != '10':
                continue
            text = link.xpath('text()').extract()[0]
            name = text.replace('…', '')
            # save author to database.
            try:
                CiAuthor.create(
                    value=value,
                    name=name
                )
                print("主键%s, 已创建." % value)
            except IntegrityError:
                print("重复主键%s, 已跳过." % value)
def callback_author_info(sel, *args, **kwargs):
    """Save an author's long description from one author-detail response.

    The first ``parent.QTS.fillbody`` line in the first ``<script>`` element
    is executed against the module-level ``parent`` shim. The text nodes of
    the stored markup, from the sixth one on, are stripped and concatenated
    into ``author.long_desc``, and the row is saved.

    :param sel: ``Selector`` for the decoded response.
    :param kwargs: must contain ``author``, the ``CiAuthor`` row to update.
    :returns: the ``Selector`` of the stored fragment when a fillbody line
        was found, otherwise the ``sel`` that was passed in.
    :raises KeyError: if ``author`` is missing from ``kwargs``.
    """
    scripts = sel.xpath('//script').extract()
    if not scripts:
        # No script payload at all: leave the author row untouched.
        return sel
    for line in scripts[0].splitlines():
        if not line.startswith('parent.QTS.fillbody'):
            continue
        # SECURITY: this executes text received from the remote server as
        # Python code. Kept as-is, but it should be replaced by parsing the
        # string argument out of the line instead of running it.
        exec(line)
        fragment = parent.QTS.content
        if isinstance(fragment, bytes):
            fragment = fragment.decode('utf-8')
        body = Selector(text=fragment)
        ds = body.xpath('//text()').extract()
        # The first five text nodes are skipped -- presumably the heading
        # and name lines; verify against a real response.
        lon = ''.join([s.strip() for s in ds[5:]]).strip()
        author = kwargs["author"]
        author.long_desc = lon
        #author.short_desc = sht
        author.save()
        print("主键%s(%s), 已更新" % (author.value, author.name))
        return body
    return sel
def callback_ci_info(sel, *args, **kwargs):
    """Create or update one ``Ci`` row from a ci-detail response.

    The first ``parent.QTS.fillbody`` line in the first ``<script>`` element
    that does not contain ``宋体`` is executed against the module-level
    ``parent`` shim. From the stored markup, the first ``<b>`` text becomes
    ``rhythmic``, the second text node becomes ``author`` and the remaining
    text nodes, joined with newlines, become ``content``.

    :param sel: ``Selector`` for the decoded response.
    :param kwargs: must contain ``seekid``, used as the row's primary key.
    :returns: the ``Selector`` of the stored fragment when a usable fillbody
        line was found, otherwise the ``sel`` that was passed in.
    :raises KeyError: if ``seekid`` is missing from ``kwargs``.
    :raises IndexError: if the fragment has no ``<b>`` text or fewer than two
        text nodes.
    """
    scripts = sel.xpath('//script').extract()
    if not scripts:
        # No script payload at all: nothing to store for this id.
        return sel
    for line in scripts[0].splitlines():
        if not line.startswith('parent.QTS.fillbody'):
            continue
        # Lines mentioning the 宋体 font are skipped -- presumably a
        # non-content variant of the body; verify against a real response.
        if '宋体' in line:
            continue
        # SECURITY: this executes text received from the remote server as
        # Python code. Kept as-is, but it should be replaced by parsing the
        # string argument out of the line instead of running it.
        exec(line)
        fragment = parent.QTS.content
        if isinstance(fragment, bytes):
            fragment = fragment.decode('utf-8')
        body = Selector(text=fragment)
        value = kwargs["seekid"]
        rhythmic = body.xpath('//b/text()').extract()[0]
        texts = body.xpath('//text()').extract()
        author = texts[1]
        content = '\n'.join(texts[2:])
        try:
            Ci.create(
                value=value,
                rhythmic=rhythmic,
                author=author,
                content=content
            )
            print("主键%s, 已创建." % value)
        except IntegrityError:
            # Row already exists: refresh its fields instead of failing.
            Ci.update(
                rhythmic=rhythmic,
                author=author,
                content=content
            ).where(
                Ci.value == value
            ).execute()
            print("重复主键%s, 已更新." % value)
        return body
    return sel
# Request helpers, one per ``seektype`` code, built in the order 1, 10, 5, 9.
# f_ci_list (code 5) is bound here but never called in this script.
(f_author_list,
 f_author_info,
 f_ci_list,
 f_ci_info) = [__with_seek_type__(code) for code in (1, 10, 5, 9)]
# Example for fetching and storing a single ci by hand:
#resp = f_ci_info(1, value=1460)
#sel = parse(resp.text, callback_ci_info, seekid=1)
def _fetch_with_retry(fetch, pageno, value, max_attempts=5):
    """Call ``fetch(pageno, value=value)``, retrying on connection errors.

    After each ``ConnectionError`` the wait message is printed and the call
    sleeps for a random 1-9 seconds before trying the SAME request again.
    Previously the failed item was skipped for good after one sleep.

    :returns: the response, or ``None`` once ``max_attempts`` tries failed.
    """
    for _ in range(max_attempts):
        try:
            return fetch(pageno, value=value)
        except requests.exceptions.ConnectionError as e:
            wait_seconds = random.choice(range(1, 10))
            print("等待%s..异常(%s)" % (wait_seconds, str(e)))
            time.sleep(wait_seconds)
    return None


def main():
    """Run the three crawl phases: author list, author details, every ci."""
    # crawl author list: pages 1..92 (the pager text says "共92页")
    for p in range(1, 93):
        resp = _fetch_with_retry(f_author_list, p, 1)
        if resp is None:
            continue
        parse(resp.text, callback_author_list)
    # crawl author info
    for author in CiAuthor.select().where(CiAuthor.value > 0):
        resp = _fetch_with_retry(f_author_info, 1, author.value)
        if resp is None:
            continue
        parse(resp.text, callback_author_info, author=author)
    # crawl every ci by id; 21050 is presumably the highest id on the site
    # -- TODO confirm
    for ci_id in range(1, 21051):
        resp = _fetch_with_retry(f_ci_info, 1, ci_id)
        if resp is None:
            # Still unreachable after all retries; the wait messages above
            # show which request failed.
            continue
        parse(resp.text, callback_ci_info, seekid=ci_id)


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
''' | |
File Name: db.py | |
Author: JackeyGao | |
mail: junqi.gao@shuyun.com | |
''' | |
from peewee import * | |
# Shared SQLite handle; both models bind to it through ``Meta.database``.
# The relative file name 'ci.db' is handed to peewee unchanged.
db = SqliteDatabase('ci.db')
class CiAuthor(Model):
    """One author row, keyed by the integer ``value``."""

    # Primary key; the scraper fills it from the author link's seek value.
    value = IntegerField(primary_key=True)
    name = CharField()
    # Both descriptions are nullable; the scraper only fills ``long_desc``.
    long_desc = TextField(null=True)
    short_desc = TextField(null=True)

    class Meta:
        database = db  # This model is stored in the 'ci.db' database.
class Ci(Model):
    """One ci (poem) row, keyed by the integer ``value``."""

    # Primary key; the scraper uses the id it requested from the site.
    value = IntegerField(primary_key=True)
    # Tune name (cipai) the poem is written to.
    rhythmic = CharField()
    # Author is stored as plain text, not as a foreign key to CiAuthor.
    author = CharField()
    content = TextField(null=True)

    class Meta:
        database = db  # This model is stored in the 'ci.db' database.
#def delete_note(): | |
# return Note.delete().execute() | |
# | |
#def delete_image(): | |
# return Image.delete().execute() | |
def init_table():
    """Create the ``Ci`` and ``CiAuthor`` tables, then close the connection.

    Safe to run more than once: ``safe=True`` makes peewee emit
    ``CREATE TABLE IF NOT EXISTS``, so existing tables are left alone.
    """
    db.connect()
    try:
        db.create_tables([Ci, CiAuthor], safe=True)
    finally:
        # Always release the connection opened above, even if creation fails.
        db.close()


if __name__ == '__main__':
    init_table()
运行
分别保存上面两个脚本为parse.py和db.py, 然后执行以下命令
$ pip install peewee parsel requests
$ python db.py # 初始化数据库
$ python parse.py