Python: fetch a given web page and all links on it
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************************************************************
# Copyright (C) 2010 [emailprotected]
# Author: yangyingchao <[emailprotected]>
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2, or (at your option) any later
# version.
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
# You should have received a copy of the GNU General Public License along with
# GNU Emacs; see the file COPYING. If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ****************************************************************************
from copy import deepcopy
from sgmllib import SGMLParser
from xml.dom.minidom import *
import os
import re
import sys
import urllib2
title = "Untitled"
class MyParser(SGMLParser):
    def __init__(self):
        self.data = ""
        self.links = []
        self.title = "Untitled"  # default in case the page has no <title>
        self.TAG_BEG = False
        self.TAG_END = False
        SGMLParser.__init__(self, 0)
    def handle_data(self, data):
        if (self.TAG_BEG is True) and (self.TAG_END is False):
            self.data += data

    def start_title(self, attrs):
        self.link = ""
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_title(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.title = self.data.strip()

    def flush(self):
        pass

    def handle_comment(self, data):
        pass
    def start_a(self, attrs):
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_a(self):
        self.TAG_BEG = False
        self.TAG_END = True
        tmp = {}
        tmp["name"] = self.data
        tmp["link"] = self.link
        self.links.append(deepcopy(tmp))

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_decl(self, data):
        pass

    def close(self):
        SGMLParser.close(self)
        self.flush()
def lst2str(lst):
    string = ""
    for item in lst:
        string += item.strip() + "\n"
    return string
def downURL(url, filename):
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        print sys.exc_info()
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1
def reptile(base_url):
    """
    Download all articles from base_url.

    Arguments:
    - `base_url`: Url of website.
    """
    page_list = []
    if not len(base_url):
        print "No page to reptile!"
        sys.exit(1)

    parser = MyParser()
    if base_url.startswith("http"):
        myopen = urllib2.urlopen
    else:
        myopen = open
    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s." % base_url
        print sys.exc_info()
        return

    parser.feed(content)
    for tmp in parser.links:
        page_list.append(tmp.get("link"))

    global title
    title = parser.title
    parser.close()

    item_list = list(set(page_list))
    for item in item_list:
        # Strip '#' from url.
        pos = item.find('#')
        if pos != -1:
            item = item[:pos]
        # Add base_url to item if necessary.
        if not item.startswith("http"):
            item = base_url.rstrip("/") + "/" + item
        local_file = item.split("/")[-1]
        print item, local_file
        if not local_file:
            print "Empty local file! Continue from next one!"
            continue
        if os.access(local_file, os.F_OK):
            print "File: %s existed, skip ..." % local_file
        else:
            ret = downURL(item, local_file)

    # Remember to download the index file!
    downURL(base_url, "index.html")
    print "Total: %d articles." % (len(item_list))
def walk_dir(lst, dirname, filenames):
    for filename in filenames:
        fn = os.path.join(dirname, filename)
        if os.path.isdir(fn) or not filename.endswith("html"):
            continue
        print "Processing: %s" % fn
        tmp = {}
        parser = MyParser()
        content = open(fn).read()
        parser.feed(content)
        tmp["file"] = filename
        tmp["title"] = parser.title
        parser.close()
        lst.append(deepcopy(tmp))
def gen_index():
    """
    Generate index of all htmls in this directory.
    """
    file_lists = []
    os.path.walk(".", walk_dir, file_lists)
    fp = open("%s.devhelp2" % os.path.basename(os.getcwd()), "w")
    string = '<?xml version="1.0" encoding="utf-8"?>\n<book author=""' + \
             ' language="c" link="index.html" name="" title="%s"' % title + \
             ' version="2" xmlns="http://www.devhelp.net/book">\n  <chapters>\n'
    for item in file_lists:
        link = item.get("file")
        try:
            name = item.get("title").decode('gbk').encode('utf-8')
        except:
            name = item.get("title")
        finally:
            string += '    <sub link="%s" name="%s"/>\n' % (link, name)
    string += '  </chapters>\n</book>\n'
    fp.write(string)
    fp.close()
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s url of baidu space" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        gen_index()
        sys.exit(1)
    base_url = sys.argv[1]
    reptile(base_url)
    gen_index()
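
Note that the script above is Python 2 code: the sgmllib module was removed in Python 3, and urllib2 became urllib.request. For readers on Python 3, here is a minimal sketch of the same core idea, fetching one page and collecting every href on it with the standard library. The class name LinkParser, the helper collect_links, and the command-line usage are illustrative assumptions, not part of the original script.

#!/usr/bin/env python3
# Minimal Python 3 sketch: fetch a page and list every link on it.
from html.parser import HTMLParser
from urllib.request import urlopen
import sys

class LinkParser(HTMLParser):
    """Collect the href attribute of every <a> tag."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for key, val in attrs:
                if key == "href" and val:
                    self.links.append(val)

def collect_links(url):
    # Decode naively as UTF-8; real pages may need charset detection.
    html = urlopen(url).read().decode("utf-8", errors="replace")
    parser = LinkParser()
    parser.feed(html)
    return parser.links

if __name__ == '__main__':
    for link in collect_links(sys.argv[1]):
        print(link)

Run as, for example, "python3 links.py http://hi.baidu.com/Username"; downloading each link and building the devhelp index would then follow the same flow as the Python 2 version above.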