加入收藏 | 设为首页 | 会员中心 | 我要投稿 安卓应用网 (https://www.0791zz.com/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 编程开发 > PHP > 正文

PHP爬虫_电影ftp下载地址

发布时间:2020-05-25 07:35:49 所属栏目:PHP 来源:互联网
导读:PHP爬虫_电影ftp下载地址

下面是脚本之家 jb51.cc 通过网络收集整理的代码片段。

脚本之家小编现在分享给大家,也给大家做个参考。

    <?php  
        // Enable tick processing so that handlers registered with pcntl_signal()
        // get a chance to run while the script is executing.
        declare(ticks = 1);  
        // Route SIGQUIT and SIGTERM to signal_handler() (defined below).
        pcntl_signal(SIGQUIT,'signal_handler');  
        pcntl_signal(SIGTERM,'signal_handler');  
      
        // PIDs of forked crawler processes; filled by run_dy2018() and read by
        // signal_handler() when shutting the crawl down.
        $crawlers_pid = array();  
        // Counter that run_dy2018() compares against the number of crawler PIDs
        // to decide when the whole crawl is finished.
        $finish_count = 0;  
      
        //Signal handler: on SIGQUIT or SIGTERM, forward SIGTERM to every recorded
        //crawler process, print an exit banner and terminate this process.
        //Any other signal number is ignored.
        function signal_handler($signal)
        {
            global $crawlers_pid;

            // Guard clause: only the two shutdown signals are acted upon.
            if ($signal != SIGQUIT && $signal != SIGTERM) {
                return;
            }

            foreach ($crawlers_pid as $child_pid) {
                posix_kill($child_pid,SIGTERM);
            }

            echo "---------- crawl task exit ----------";
            // The MySQL handle is imported here but never used before exiting.
            global $con;
            exit();
        }
      
        //Fetch the content behind $url with a plain GET-style read.
        //Returns whatever file_get_contents() yields: the body as a string,
        //or false when the read fails.
        function get_page_content($url)
        {
            return file_get_contents($url);
        }
      
        //Fetch the content behind $url with an HTTP POST request.
        //  $url - address to request
        //  $arr - form fields (array or object); URL-encoded into the request body
        //Returns the response body as a string, or false when the request fails.
        function get_page_content_by_post($url,$arr)
        {
            // The original stored the encoded body in $arr but then read the
            // undefined $data, so every request was sent with an empty body.
            $data = http_build_query($arr);

            // Header lines must be separated by CRLF; the original glued them
            // together with a space and a stray double quote.
            $headers = implode("\r\n", array(
                'Content-Type: application/x-www-form-urlencoded',
                'Content-Length: '.strlen($data),
            ));

            $opts = array(
                'http' => array(
                    'method'  => 'POST',
                    'header'  => $headers,
                    'content' => $data,
                ),
            );
            $context = stream_context_create($opts);

            return file_get_contents($url,false,$context);
        }
      
        //Main dy2018 crawl flow: fork one crawler process per entry URL, then
        //wait in the parent until every crawler has exited.
        //This function never returns; every path ends in exit().
        //
        //NOTE(review): the mysql_* extension used here was removed in PHP 7.
        //Moving to mysqli/PDO has to be done together with crawl_process(),
        //which relies on the implicit connection opened below. The hard-coded
        //root credentials should also move to configuration.
        function run_dy2018()
        {
            global $crawlers_pid;
            global $finish_count;

            // Entry URLs: six TV category listings followed by the numbered
            // category listings /0/ .. /20/.
            $base = "http://www.dy2018.com";
            $crawl_urls = array();
            foreach (array("hytv","hepai","gangtai","oumeitv","rihantv","tvzz") as $tv_category) {
                $crawl_urls[] = $base."/html/tv/".$tv_category."/";
            }
            for ($n = 0; $n <= 20; $n++) {
                $crawl_urls[] = $base."/".$n."/";
            }

            foreach ($crawl_urls as $i => $url) {
                $pid = pcntl_fork();
                if ($pid == -1) {
                    echo "system error. check it now!";
                    exit();
                } else if ($pid > 0) {
                    // Parent: remember the child so signal_handler() can stop it.
                    $crawlers_pid[$i] = $pid;
                } else {
                    // Child: it inherits a copy of the sibling PID list. Clear it
                    // so that the inherited signal handler only ends this child
                    // instead of signalling its siblings as well.
                    $crawlers_pid = array();

                    $con = mysql_connect("localhost","root","123456");
                    if (!$con) {
                        die('Could not connect: '.mysql_error());
                    }
                    mysql_select_db("mysql",$con);
                    crawl_process($url);
                    mysql_close($con);

                    // The child must stop here. Previously it fell through, kept
                    // iterating the URL list and forked further children itself.
                    exit(0);
                }
            }

            // Reap children without blocking: a blocking pcntl_waitpid() may
            // prevent the signal handlers from running, so poll with WNOHANG and
            // sleep between polls. Counting reaped children replaces the old
            // check on a counter that was only ever incremented inside the
            // children's own memory and therefore never changed in the parent.
            while ($finish_count < count($crawlers_pid)) {
                $reaped = pcntl_waitpid(-1,$status,WNOHANG);
                if ($reaped > 0) {
                    $finish_count++;
                    continue;
                }
                if ($reaped == -1) {
                    // No children left to wait for.
                    break;
                }
                sleep(1);
            }

            echo "---------- crawl task finish ----------";
            exit();
        }
      
        //Crawl one entry URL: walk its paginated index pages, follow every
        //detail-page link found on them, extract ftp:// download links from each
        //detail page and insert them into the dy2018_url table.
        //  $url - entry (category) URL ending in "/"
        //Stops at the first index page that cannot be fetched or is empty.
        //Relies on a MySQL connection having been opened by the caller.
        function crawl_process($url)
        {
            echo "start handle url:".$url;

            // Detail pages look like /i/<digits>.html. '#' is used as delimiter
            // so the slashes need no escaping (the previous pattern had lost its
            // backslashes and was not a valid regex).
            $info_url_pattern = '#/i/\d+\.html#';
            // ftp links ending in a known media extension. Left unanchored: the
            // original author noted that ^ and $ did not work for this match.
            $ftp_url_pattern = '#ftp://.*?\.(swf|avi|flv|mpg|rm|mov|wav|asf|3gp|mkv|rmvb)#i';

            for ($page_idx = 1; ; $page_idx++) {
                $page_url = get_page_index_url($url,$page_idx);
                // echo instead of printf: the URL must not be used as a format string.
                echo "start crawl url:".$page_url."\n";

                $page_content = get_page_content($page_url);
                if (!is_valid_page($page_content)) {
                    break;
                }

                $matches_urls = array();
                preg_match_all($info_url_pattern,$page_content,$matches_urls);
                // A listing may link the same detail page more than once; fetch
                // each one only once.
                $detail_paths = empty($matches_urls[0])
                    ? array()
                    : array_values(array_unique($matches_urls[0]));

                foreach ($detail_paths as $detail_path) {
                    $detail_url = 'http://www.dy2018.com'.$detail_path;
                    $detail_page_content = get_page_content($detail_url);

                    if (is_valid_page($detail_page_content)) {
                        // NOTE(review): the site is assumed to serve GBK, as the
                        // original index-page conversion did; convert GBK -> UTF-8
                        // (the old call passed "GBK" as the target encoding).
                        $detail_page_content = mb_convert_encoding($detail_page_content,"UTF-8","GBK");

                        $ftp_urls = array();
                        preg_match_all($ftp_url_pattern,$detail_page_content,$ftp_urls);
                        $ftp_links_unique = empty($ftp_urls[0])
                            ? array()
                            : array_values(array_unique($ftp_urls[0]));

                        foreach ($ftp_links_unique as $ftp_link) {
                            // Scraped text is untrusted: escape it before it is
                            // embedded in the SQL statement.
                            $safe_link = mysql_real_escape_string($ftp_link);
                            mysql_query("insert into dy2018_url (url,status) values('$safe_link','0')");
                        }
                    }

                    // Throttle: one detail page per second.
                    sleep(1);
                }
            }
        }
      
        //Build the URL of index page number $idx below the entry URL $url.
        //  page 1     -> <url>index.html
        //  page n > 1 -> <url>index_<n>.html
        //  otherwise  -> <url> unchanged
        function get_page_index_url($url,$idx)
        {
            if ($idx == 1) {
                return $url.'index.html';
            }
            if ($idx > 1) {
                return $url.'index_'.$idx.'.html';
            }
            return $url;
        }
      
        //Decide from the fetched content whether the page is usable:
        //any truthy content counts as valid; false, null, '' and '0' do not.
        function is_valid_page($content)
        {
            return (bool) $content;
        }
        // Script entry point: start the crawl.
        run_dy2018();  
        // NOTE(review): run_dy2018() has no return path (it either exit()s or
        // keeps looping), so this call appears to be unreachable.
        mysql_close();  
    ?>  

以上是脚本之家(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。

如果觉得脚本之家网站内容还不错,欢迎将脚本之家网站推荐给程序员好友。

(编辑:安卓应用网)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

    推荐文章
      热点阅读