PHP爬虫_电影ftp下载地址
发布时间:2020-05-25 07:35:49 所属栏目:PHP 来源:互联网
导读:PHP爬虫_电影ftp下载地址
|
下面是脚本之家 jb51.cc 通过网络收集整理的代码片段。脚本之家小编现在分享给大家,也给大家做个参考。
<?php
// Ticks must be enabled so that handlers installed with pcntl_signal() get dispatched.
declare(ticks = 1);

// Shared state used by the crawler functions below:
// PIDs of the forked crawler processes, and the number of crawls that have finished.
$finish_count = 0;
$crawlers_pid = [];

// Route both termination signals to the same shutdown handler.
pcntl_signal(SIGTERM,'signal_handler');
pcntl_signal(SIGQUIT,'signal_handler');
//Signal handler installed for SIGQUIT and SIGTERM.
//Forwards SIGTERM to every PID recorded in the global $crawlers_pid list,
//prints an exit banner and terminates the current process.
//$signal: signal number delivered by pcntl; any other signal is ignored.
function signal_handler($signal)
{
    global $crawlers_pid;

    // Only the two termination signals trigger a shutdown.
    if ($signal !== SIGQUIT && $signal !== SIGTERM) {
        return;
    }

    foreach ($crawlers_pid as $pid) {
        posix_kill($pid,SIGTERM);
    }
    echo "---------- crawl task exit ----------";
    // No connection is closed here: the script never stores a MySQL link in a global,
    // so the former `global $con` declaration had nothing to act on.
    exit();
}
//Fetch the content behind $url with a plain GET (via file_get_contents).
//Returns whatever file_get_contents returns: the body as a string, or false on failure.
function get_page_content($url)
{
    return file_get_contents($url);
}
//Fetch the content behind $url with an HTTP POST request.
//$url: target URL.
//$arr: form fields (name => value); they are url-encoded and sent as the request body.
//Returns the response body as a string, or false when file_get_contents fails.
function get_page_content_by_post($url,$arr)
{
    // Encode the fields once and reuse the result for both the body and its length.
    // (The previous code read an undefined $data, so the body was always empty.)
    $body = http_build_query($arr);

    $opts = array(
        'http' => array(
            'method' => 'POST',
            // Header lines must be separated by CRLF.
            'header' => "Content-Type: application/x-www-form-urlencoded\r\n"
                ."Content-Length: ".strlen($body)."\r\n",
            'content' => $body,
        ),
    );

    $context = stream_context_create($opts);
    return file_get_contents($url,false,$context);
}
//Main dy2018 crawl flow.
//Forks one child process per entry URL; each child opens its own MySQL connection,
//crawls its URL with crawl_process() and exits. The parent records the child PIDs in the
//global $crawlers_pid, counts finished children in the global $finish_count, and exits
//once every child has been reaped. This function never returns.
function run_dy2018()
{
    global $crawlers_pid;
    global $finish_count;

    // Entry URLs: the six TV category pages followed by the numbered categories 0..20.
    $crawl_urls = array(
        "http://www.dy2018.com/html/tv/hytv/",
        "http://www.dy2018.com/html/tv/hepai/",
        "http://www.dy2018.com/html/tv/gangtai/",
        "http://www.dy2018.com/html/tv/oumeitv/",
        "http://www.dy2018.com/html/tv/rihantv/",
        "http://www.dy2018.com/html/tv/tvzz/",
    );
    for ($n = 0; $n <= 20; $n++) {
        $crawl_urls[] = "http://www.dy2018.com/".$n."/";
    }

    foreach ($crawl_urls as $i => $url) {
        $pid = pcntl_fork();

        if ($pid == -1) {
            echo "system error. check it now!";
            // Do not leave already-started crawlers running without a supervisor.
            foreach ($crawlers_pid as $child_pid) {
                posix_kill($child_pid,SIGTERM);
            }
            exit();
        }

        if ($pid > 0) {
            // Parent: remember the child and move on to the next URL.
            $crawlers_pid[$i] = $pid;
            continue;
        }

        // Child process from here on.
        // Forget the inherited sibling PIDs so the inherited signal handler cannot kill them.
        $crawlers_pid = array();

        // NOTE(review): credentials and database name are hard-coded; consider moving them to config.
        $con = mysql_connect("localhost","root","123456");
        if(!$con) {
            die('Could not connect: '.mysql_error());
        }
        mysql_select_db("mysql",$con);

        crawl_process($url);

        mysql_close($con);
        // The child must stop here; otherwise it would continue the loop and fork again.
        exit();
    }

    // Parent: poll for finished children. A non-blocking wait plus a short sleep is used
    // (instead of a blocking pcntl_waitpid) so signal handling is not held up.
    while ($finish_count < count($crawlers_pid)) {
        $reaped = pcntl_waitpid(-1,$status,WNOHANG);
        if ($reaped > 0) {
            $finish_count++;
            continue;
        }
        if ($reaped == -1) {
            // No child left to wait for.
            break;
        }
        sleep(1);
    }

    echo "---------- crawl task finish ----------";
    exit();
}
//Crawl one entry URL: walk its index pages (index.html, index_2.html, ...) until a page
//cannot be fetched, open every detail page linked from each index page, extract the ftp
//download links and insert them into the dy2018_url table.
//$url: entry URL ending with a slash.
//Relies on a MySQL connection already opened by the caller (legacy mysql_* API).
function crawl_process($url)
{
    echo "start handle url:".$url;

    // '#' is used as the delimiter so the slashes inside the patterns need no escaping.
    // Detail pages look like /i/<digits>.html.
    $info_url_pattern = '#/i/\d+\.html#';
    // No ^/$ anchors: the links are embedded in the page text, not whole lines.
    $ftp_url_pattern = '#ftp://.*?\.(swf|avi|flv|mpg|rm|mov|wav|asf|3gp|mkv|rmvb)#i';

    $page_idx = 1;
    while (true) {
        $page_url = get_page_index_url($url,$page_idx);
        // echo instead of printf: the URL must not be interpreted as a format string.
        echo "start crawl url:".$page_url."\n";

        $page_content = get_page_content($page_url);
        if (!is_valid_page($page_content)) {
            // A failed fetch marks the end of the index pages.
            break;
        }

        $matches_urls = array();
        preg_match_all($info_url_pattern,$page_content,$matches_urls);

        // The same detail link can appear more than once on an index page; visit it once.
        foreach (array_unique($matches_urls[0]) as $detail_path) {
            $detail_url = 'http://www.dy2018.com'.$detail_path;
            $detail_page_content = get_page_content($detail_url);

            if (is_valid_page($detail_page_content)) {
                // Convert from GBK to UTF-8 (the old call converted *to* GBK instead).
                $detail_page_content = mb_convert_encoding($detail_page_content,"UTF-8","GBK");

                $ftp_urls = array();
                preg_match_all($ftp_url_pattern,$detail_page_content,$ftp_urls);

                foreach (array_unique($ftp_urls[0]) as $ftp_link) {
                    // Links come from a remote page: escape them before building the SQL.
                    $escaped_link = mysql_real_escape_string($ftp_link);
                    mysql_query("insert into dy2018_url (url,status) values('$escaped_link','0')");
                }
            }

            // Throttle: one detail page per second.
            sleep(1);
        }

        $page_idx++;
    }
}
//Build the URL of index page number $idx under the entry URL $url.
//Page 1 maps to "index.html", pages above 1 map to "index_<idx>.html";
//any other $idx returns $url unchanged.
function get_page_index_url($url,$idx)
{
    if ($idx == 1) {
        return $url.'index.html';
    }

    if ($idx > 1) {
        return $url.'index_'.$idx.'.html';
    }

    return $url;
}
//Decide from the fetched content whether a page counts as valid.
//Returns true when $content is truthy, false otherwise (e.g. false or an empty string).
function is_valid_page($content)
{
    return (bool) $content;
}
// Script entry point: start the crawl.
run_dy2018();
// NOTE(review): run_dy2018() only ever leaves via exit(), so this call does not appear to be reached.
mysql_close();
?>
以上是脚本之家(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。 如果觉得脚本之家网站内容还不错,欢迎将脚本之家网站推荐给程序员好友。 (编辑:安卓应用网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意中侵犯到您的权利,请及时联系站长删除相关内容! |
