python书籍信息爬虫实例
发布时间:2020-05-24 19:31:16 所属栏目:Python 来源:互联网
导读:python书籍信息爬虫示例,供大家参考,具体内容如下。背景说明:需要收集一些书籍信息,以豆瓣书籍条目作为源,得到一些有效书籍信息,并保存到本地数据库。
|
python书籍信息爬虫示例,供大家参考,具体内容如下。 【背景说明】 需要收集一些书籍信息,以豆瓣书籍条目作为源,得到一些有效书籍信息,并保存到本地数据库。 【获取书籍分类标签】 具体可参考豆瓣图书标签页(原文此处的链接已缺失,可参考 https://book.douban.com/tag/ )。然后将这些分类标签链接存到本地某个文件(下文代码读取的文件名为 book_tags.txt),每行一个链接,存储内容如下: https://book.douban.com/tag/小说 https://book.douban.com/tag/外国文学 https://book.douban.com/tag/文学 https://book.douban.com/tag/随笔 https://book.douban.com/tag/中国文学 https://book.douban.com/tag/经典 https://book.douban.com/tag/日本文学 https://book.douban.com/tag/散文 https://book.douban.com/tag/村上春树 https://book.douban.com/tag/诗歌 https://book.douban.com/tag/童话 ...... 【获取书籍信息,并保存到本地数据库】 假设已经建好mysql表,如下: CREATE TABLE `book_info` ( `id` int(11) NOT NULL AUTO_INCREMENT,`bookid` varchar(64) NOT NULL COMMENT 'book ID',`tag` varchar(32) DEFAULT '' COMMENT '分类目录',`bookname` varchar(256) NOT NULL COMMENT '书名',`subname` varchar(256) NOT NULL COMMENT '二级书名',`author` varchar(256) DEFAULT '' COMMENT '作者',`translator` varchar(256) DEFAULT '' COMMENT '译者',`press` varchar(128) DEFAULT '' COMMENT '出版社',`publishAt` date DEFAULT '0000-00-00' COMMENT '出版日期',`stars` float DEFAULT '0' COMMENT '评分',`price_str` varchar(32) DEFAULT '' COMMENT '价格string',`hotcnt` int(11) DEFAULT '0' COMMENT '评论人数',`bookdesc` varchar(8192) DEFAULT NULL COMMENT '简介',`updateAt` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改日期',PRIMARY KEY (`id`),UNIQUE KEY `idx_bookid` (`bookid`),KEY `idx_bookname` (`bookname`),KEY `hotcnt` (`hotcnt`),KEY `stars` (`stars`),KEY `idx_tag` (`tag`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='书籍信息'; 并已实现相关爬虫逻辑,主要用到了BeautifulSoup包,如下:
#!/usr/bin/python
# coding: utf-8
import re
import logging
import requests
import pymysql
import random
import time
import datetime
from hashlib import md5
from bs4 import BeautifulSoup
# Configure the root logger once at import time: INFO and above, with each
# record prefixed by its level, logger name and timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s][%(name)s][%(asctime)s]%(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
class DestDB:
    """Connection settings for the destination MySQL database."""

    # Server address, schema and table name.
    Host = '192.168.1.10'
    DB = 'spider'
    Table = 'book_info'

    # Login credentials.
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file.
    User = 'test'
    Pwd = '123456'
def connect_db(host, db, user, pwd):
    """Open an autocommit connection to a MySQL server.

    Args:
        host: Server host name or IP address.
        db: Name of the database (schema) to select.
        user: Login user name.
        pwd: Login password.

    Returns:
        An open ``pymysql`` connection with autocommit enabled.
    """
    # `password` / `database` are the documented keyword names; the
    # `passwd` / `db` spellings are deprecated aliases. Autocommit is
    # requested as part of connect() so there is no window in which a
    # half-configured connection could be leaked.
    return pymysql.connect(
        host=host,
        user=user,
        password=pwd,
        database=db,
        charset='utf8',
        connect_timeout=3600,
        autocommit=True,
    )
def disconnect_db(conn, cursor):
    """Close a cursor and then its connection.

    Args:
        conn: Connection object exposing ``close()``.
        cursor: Cursor object exposing ``close()``.

    Raises:
        Whatever ``cursor.close()`` or ``conn.close()`` raises; the
        connection is closed even when closing the cursor fails.
    """
    try:
        cursor.close()
    finally:
        # Always release the connection, even if the cursor close failed.
        conn.close()
# Extract the number of raters; when it cannot be read as a plain number
# (e.g. "fewer than 10 raters"), fall back to 10.
def hotratings(person):
    """Return the rater count parsed from a rating-count element.

    The first whitespace-separated token of the element's text is taken,
    its first character and last four characters are dropped, and the
    remainder is parsed as an integer (e.g. ``"(1234人评价)"`` -> 1234).

    Args:
        person: Object exposing ``get_text()`` that returns the label text.

    Returns:
        The parsed count, or 10 when the text is empty or the remainder is
        not an integer.
    """
    default_count = 10
    tokens = person.get_text().split()
    if not tokens:
        # Empty / whitespace-only text previously raised IndexError.
        return default_count
    ptext = tokens[0]
    try:
        return int(ptext[1:len(ptext) - 4])
    except ValueError:
        return default_count
# Persist parsed rows to the database.
def save_to_db(tag, book_reslist):
    """Insert a batch of book rows with a single ``insert ignore``.

    Args:
        tag: Category tag stored with every row.
        book_reslist: List of rows. The first 10 items of each row are
            written, in order, to the columns author, translator, bookname,
            subname, press, publishAt, price_str, stars, hotcnt, bookdesc.

    Raises:
        ValueError: If a row has fewer than 10 items.
    """
    if not book_reslist:
        # An empty batch would produce "... values " with no tuples.
        return

    fields_per_row = 10
    row_placeholder = "(%s)" % ",".join(['%s'] * (fields_per_row + 2))
    isql = "insert ignore into %s " % DestDB.Table
    isql += "(`bookid`,`tag`,`author`,`translator`,`bookname`,`subname`,`press`,"
    isql += "`publishAt`,`price_str`,`stars`,`hotcnt`,`bookdesc`) values "
    isql += ",".join([row_placeholder] * len(book_reslist))

    values = []
    for row in book_reslist:
        if len(row) < fields_per_row:
            raise ValueError(
                "book row needs %d fields, got %d" % (fields_per_row, len(row)))
        # For now the unique bookid is the md5 of "<row[0]>_<row[2]>"
        # (author and book name, per the column order above).
        bookid = md5(("%s_%s" % (row[0], row[2])).encode('utf-8')).hexdigest()
        values.extend([bookid, tag] + list(row[:fields_per_row]))

    dest_conn = connect_db(DestDB.Host, DestDB.DB, DestDB.User, DestDB.Pwd)
    dest_cursor = None
    try:
        dest_cursor = dest_conn.cursor()
        dest_cursor.execute(isql, tuple(values))
    finally:
        # Release the connection even when cursor creation or execute fails.
        if dest_cursor is not None:
            disconnect_db(dest_conn, dest_cursor)
        else:
            dest_conn.close()
def _parse_publish_date(text):
    """Map a 'YYYY-M...' string to the first day of that month.

    Returns the placeholder date 1000-01-01 when the text does not start
    with a year-month pair or the pair is not a valid date.
    """
    matched = re.match(r'^(\d{4})-(\d{1,2})', text)
    if matched:
        try:
            return datetime.date(int(matched.group(1)), int(matched.group(2)), 1)
        except ValueError:
            pass  # e.g. month 0 or 13, year 0000
    return datetime.date(1000, 1, 1)


def _parse_book_item(item):
    """Build the 10-field row for one list item, or None when unusable.

    Row order: author, translator, bookname, subtitle, press, publishAt,
    price, stars, hotcnt, desc.
    """
    title_link = item.select_one("div.info > h2 > a")
    detail = item.select_one("div.info > div.pub")
    if title_link is None or detail is None:
        return None

    title_strs = [s.replace('\n', '').strip() for s in title_link.strings]
    title_strs = [s for s in title_strs if s]
    if not title_strs:
        return None
    bookname = title_strs[0]
    # Some books carry a secondary title as a second text fragment.
    subtitle = title_strs[1] if len(title_strs) >= 2 else ""

    detail_texts = [
        s.strip() for s in detail.get_text().replace('\n', '').split("/")
    ]
    if len(detail_texts) == 4:
        # Some books have no translator field.
        author, press, publish_at, price = detail_texts
        translator = ""
    elif len(detail_texts) >= 5:
        author, translator, press, publish_at, price = detail_texts[:5]
    else:
        return None

    # A missing or empty score is stored as -1.
    score = item.select_one("div.info > div.star.clearfix > span.rating_nums")
    score_text = score.get_text().strip() if score is not None else ""
    stars = float('%.1f' % float(score_text if score_text else "-1"))

    person = item.select_one("div.info > div.star.clearfix > span.pl")
    hotcnt = hotratings(person) if person is not None else 10

    desc_tag = item.select_one("div.info > p")
    desc = desc_tag.get_text() if desc_tag is not None else ""

    return [author, translator, bookname, subtitle, press,
            _parse_publish_date(publish_at), price, stars, hotcnt, desc]


# Handle one fetched listing page.
def do_parse(tag, url):
    """Fetch one listing page, parse its books and store them.

    Each ``#subject_list > ul > li`` element is parsed on its own, so a book
    without a score or description cannot shift the fields of other books.

    Args:
        tag: Category tag stored with the rows; when empty, the last path
            segment of ``url`` (query string removed) is used instead.
        url: Page URL to fetch.

    Returns:
        The number of book list items found on the page (not the number of
        rows saved).
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    page_data = requests.get(url, timeout=30)
    soup = BeautifulSoup(page_data.text.encode("utf-8"), "lxml")
    tag = tag or url.split("?")[0].split("/")[-1]

    items = soup.select("#subject_list > ul > li")
    book_reslist = []
    for item in items:
        try:
            row = _parse_book_item(item)
        except Exception as e:
            # Best effort: one malformed entry must not drop the whole page.
            logging.error("skip unparsable book item: %s", e)
            continue
        if row is not None:
            book_reslist.append(row)

    logging.info("insert count: %d" % len(book_reslist))
    if book_reslist:
        save_to_db(tag, book_reslist)
    return len(items)
def main():
    """Crawl every tag URL listed in ``book_tags.txt``, page by page.

    Pages are requested 20 entries apart (``start=0, 20, 40, ...``). A tag
    is finished once a page reports fewer than 10 entries. Any exception
    raised for a page is logged, followed by a 300-second pause, after which
    the loop moves on to the next page offset.
    """
    # The tag file contains non-ASCII tag names, so do not rely on the
    # platform default encoding.
    with open("book_tags.txt", encoding="utf-8") as fd:
        tag_urls = [line.strip() for line in fd]

    for tag_url in tag_urls:
        if not tag_url:
            # A blank line would otherwise yield a URL like "?start=0&type=T".
            continue
        logging.info("current tag url: %s" % tag_url)
        tag_name = tag_url.split('/')[-1]
        for idx in range(0, 1000000, 20):
            try:
                url = "%s?start=%d&type=T" % (tag_url, idx)
                cnt = do_parse(tag_name, url)
                if cnt < 10:
                    break
                # Sleep a few seconds to keep the request rate low.
                time.sleep(random.randint(10, 15))
            except Exception as e:
                # NOTE(review): a persistent failure keeps retrying later
                # offsets with a 300 s pause each; consider capping retries.
                logging.warning("outer_err: %s" % e)
                time.sleep(300)


if __name__ == "__main__":
    main()
小结:以上代码基于 Python 3 环境运行。以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持编程小技巧。 您可能感兴趣的文章:
(编辑:安卓应用网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意中侵犯到您的权利,请及时联系站长删除相关内容! |
