Python: fetch a given web page and all links on it
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************************************************************
# Copyright (C) 2010 [emailprotected]
# Author: yangyingchao <[emailprotected]>
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2, or (at your option) any later
# version.
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
# You should have received a copy of the GNU General Public License along with
# GNU Emacs; see the file COPYING. If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ****************************************************************************
from copy import deepcopy
from sgmllib import SGMLParser
from xml.dom.minidom import *
import os
import re
import sys
import urllib2
title = "Untitled"
class MyParser(SGMLParser):
    def __init__(self):
        self.data = ""
        self.links = []
        self.title = "Untitled"  # default in case the page has no <title>
        self.TAG_BEG = False
        self.TAG_END = False
        SGMLParser.__init__(self, 0)
    def handle_data(self, data):
        if (self.TAG_BEG is True) and (self.TAG_END is False):
            self.data += data

    def start_title(self, attrs):
        self.link = ""
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_title(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.title = self.data.strip()

    def flush(self):
        pass

    def handle_comment(self, data):
        pass
    def start_a(self, attrs):
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_a(self):
        self.TAG_BEG = False
        self.TAG_END = True
        tmp = {}
        tmp["name"] = self.data
        tmp["link"] = self.link
        self.links.append(deepcopy(tmp))

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_decl(self, data):
        pass

    def close(self):
        SGMLParser.close(self)
        self.flush()
def lst2str(lst):
    string = ""
    for item in lst:
        string += item.strip() + "\n"
    return string
def downURL(url, filename):
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        print sys.exc_info()
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1
def reptile(base_url):
    """
    Download all articles from base_url.

    Arguments:
    - `base_url`: Url of website.
    """
    page_list = []
    if not len(base_url):
        print "No page to reptile!"
        sys.exit(1)

    parser = MyParser()
    if base_url.startswith("http"):
        myopen = urllib2.urlopen
    else:
        myopen = open
    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s." % base_url
        print sys.exc_info()
        return

    parser.feed(content)
    for tmp in parser.links:
        page_list.append(tmp.get("link"))

    global title
    title = parser.title
    parser.close()

    item_list = list(set(page_list))
    for item in item_list:
        # Strip '#' from url.
        pos = item.find('#')
        if pos != -1:
            item = item[:pos]
        # Add base_url to item if necessary.
        if not item.startswith("http"):
            item = base_url.rstrip("/") + "/" + item
        local_file = item.split("/")[-1]
        print item, local_file
        if not local_file:
            print "Empty local file! Continue from next one!"
            continue
        if os.access(local_file, os.F_OK):
            print "File: %s existed, skip ..." % local_file
        else:
            ret = downURL(item, local_file)

    # Remember to download the index file!
    downURL(base_url, "index.html")
    print "Total: %d articles." % (len(item_list))
def walk_dir(lst, dirname, filenames):
    for filename in filenames:
        fn = os.path.join(dirname, filename)
        if os.path.isdir(fn) or not filename.endswith("html"):
            continue
        print "Processing: %s" % fn
        tmp = {}
        parser = MyParser()
        content = open(fn).read()
        parser.feed(content)
        tmp["file"] = filename
        tmp["title"] = parser.title
        parser.close()
        lst.append(deepcopy(tmp))
def gen_index():
    """
    Generate index of all htmls in this directory.
    """
    file_lists = []
    os.path.walk(".", walk_dir, file_lists)
    fp = open("%s.devhelp2" % os.path.basename(os.getcwd()), "w")
    string = '<?xml version="1.0" encoding="utf-8"?>\n<book author=""' + \
             ' language="c" link="index.html" name="" title="%s"' % title + \
             ' version="2" xmlns="http://www.devhelp.net/book">\n  <chapters>\n'
    for item in file_lists:
        link = item.get("file")
        try:
            name = item.get("title").decode('gbk').encode('utf-8')
        except:
            name = item.get("title")
        finally:
            string += '    <sub link="%s" name="%s"/>\n' % (link, name)
    string += '  </chapters>\n</book>\n'
    fp.write(string)
    fp.close()
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s url of baidu space" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        gen_index()
        sys.exit(1)
    base_url = sys.argv[1]
    reptile(base_url)
    gen_index()
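
Note that the script above is Python 2 code: the sgmllib module was removed in Python 3, and urllib2 became urllib.request. For readers on Python 3, here is a minimal sketch of the same core idea, fetching one page and collecting every href on it with the standard library. The class name LinkParser, the helper collect_links, and the command-line usage are illustrative assumptions, not part of the original script.

#!/usr/bin/env python3
# Minimal Python 3 sketch: fetch a page and list every link on it.
from html.parser import HTMLParser
from urllib.request import urlopen
import sys

class LinkParser(HTMLParser):
    """Collect the href attribute of every <a> tag."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for key, val in attrs:
                if key == "href" and val:
                    self.links.append(val)

def collect_links(url):
    # Decode naively as UTF-8; real pages may need charset detection.
    html = urlopen(url).read().decode("utf-8", errors="replace")
    parser = LinkParser()
    parser.feed(html)
    return parser.links

if __name__ == '__main__':
    for link in collect_links(sys.argv[1]):
        print(link)

Run as, for example, "python3 links.py http://hi.baidu.com/Username"; downloading each link and building the devhelp index would then follow the same flow as the Python 2 version above.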