加入收藏 | 设为首页 | 会员中心 | 我要投稿 安卓应用网 (https://www.0791zz.com/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 编程开发 > PHP > 正文

PHP实现的一个很好用的HTML解析器类,可用于采集数据

发布时间:2020-05-28 15:21:00 所属栏目:PHP 来源:互联网
导读:下面就是这个 HTML解析类及用法,下面的功能是采集www.opendir.cn这个网站的百度收录数据,需要的朋友可以测试下

<div class="codetitle"><a style="CURSOR: pointer" data="85400" class="copybut" id="copybut85400" onclick="doCopy('code85400')"> 代码如下:<div class="codebody" id="code85400">
<?php
// Switch libxml to internal error collection so that malformed HTML does not
// emit PHP warnings; the previous setting is kept in $oldSetting (it is not
// restored anywhere in the visible part of this file).
$oldSetting = libxml_use_internal_errors( true );
// Discard any libxml errors that were buffered before this point.
libxml_clear_errors();
/**
 * -+-----------------------------------
 * |PHP5 Framework - 2011
 * |Web Site: www.iblue.cc
 * |E-mail: mejinke@gmail.com
 * |Date: 2012-10-12
 * -+-----------------------------------
 *
 * @desc HTML parser
 * @author jingke
 *
 * Thin wrapper around DOMDocument/DOMXPath. An instance is either a document
 * root (empty node path) or a handle on one node, identified by the absolute
 * XPath returned by DOMNode::getNodePath().
 */
class XF_HtmlDom
{
    /** @var DOMXPath|null XPath evaluator shared by every handle of one document. */
    private $_xpath = null;

    /** @var string Absolute XPath of the node this handle points at; '' for the root. */
    private $_nodePath = '';

    /**
     * @param DOMXPath|null $xpath    Evaluator of an already loaded document, or null.
     * @param string        $nodePath Absolute XPath of the wrapped node, '' for the root.
     */
    public function __construct($xpath = null, $nodePath = '')
    {
        $this->_xpath = $xpath;
        $this->_nodePath = $nodePath;
    }

    /**
     * Load and parse an HTML document.
     *
     * Locations starting with http:// or https:// are downloaded with cURL;
     * anything else is read with file_get_contents().
     *
     * @param string $url URL or local path of the document.
     * @return bool True when a document was parsed, false when the content
     *              could not be fetched, was empty, or could not be parsed.
     */
    public function loadHtml($url)
    {
        // Kept from the original: default user agent for PHP stream wrappers.
        ini_set('user_agent', 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML,like Gecko) Version/4.0 Mobile Safari/530.17 –Nexus');

        // Only a real scheme prefix selects cURL; a local path that merely
        // contains "http" must still be read from disk.
        if (preg_match('#^https?://#i', $url) === 1) {
            $content = $this->_fetchRemote($url);
        } else {
            $content = file_get_contents($url);
        }

        // DOMDocument::loadHTML() rejects empty input, so stop early.
        if ($content === false || trim($content) === '') {
            $this->_xpath = null;
            return false;
        }

        // Real-world HTML is rarely valid: buffer libxml errors instead of
        // emitting warnings, then restore the caller's setting.
        $previous = libxml_use_internal_errors(true);
        $html = new DOMDocument();
        $loaded = $html->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($loaded === false) {
            $this->_xpath = null;
            return false;
        }

        $this->_xpath = new DOMXPath($html);
        // A node path from a previously loaded document is meaningless here.
        $this->_nodePath = '';
        return true;
    }

    /**
     * Find nodes below the current node.
     *
     * From the root the query is evaluated as "//query"; from a node handle
     * it is evaluated as "nodePath/query".
     *
     * @param string   $query XPath fragment, e.g. "td[@id='baidu']/a".
     * @param int|null $index Zero-based position of the match to return, or
     *                        null (or any non-numeric value) for all matches.
     * @return XF_HtmlDom[]|XF_HtmlDom List of handles when no index is given;
     *         otherwise a single handle. If the indexed match does not exist
     *         the handle is empty: text() and getAttribute() return false.
     */
    public function find($query, $index = null)
    {
        // Build the expression locally: mutating $this->_nodePath here would
        // corrupt the path for every later call on the same object.
        $prefix = ($this->_nodePath === '') ? '//' : $this->_nodePath . '/';
        $nodes = $this->_query($prefix . $query);

        if ($index === null || !is_numeric($index)) {
            $handles = array();
            if ($nodes !== null) {
                foreach ($nodes as $node) {
                    $handles[] = new self($this->_xpath, $node->getNodePath());
                }
            }
            return $handles;
        }

        $node = ($nodes !== null) ? $nodes->item((int) $index) : null;
        if ($node === null) {
            return new self();
        }
        return new self($this->_xpath, $node->getNodePath());
    }

    /**
     * Get the text content of the wrapped node.
     *
     * @return string|false Text content, or false when no node is wrapped.
     */
    public function text()
    {
        $node = $this->_currentNode();
        return ($node === null) ? false : $node->textContent;
    }

    /**
     * Get an attribute value of the wrapped node.
     *
     * @param string $name Attribute name.
     * @return string|false Attribute value ('' when the attribute is absent),
     *                      or false when no element is wrapped.
     */
    public function getAttribute($name)
    {
        $node = $this->_currentNode();
        // Only elements carry attributes; text and attribute nodes do not.
        if (!($node instanceof DOMElement)) {
            return false;
        }
        return $node->getAttribute($name);
    }

    /**
     * Read 'innertext' (text content) or any attribute by name.
     *
     * @param string $name 'innertext' or an attribute name.
     * @return string|false
     */
    public function get($name)
    {
        if ($name == 'innertext') {
            return $this->text();
        }
        return $this->getAttribute($name);
    }

    /**
     * Property-style access, e.g. $node->innertext or $node->href.
     *
     * @param string $name 'innertext' or an attribute name.
     * @return string|false
     */
    public function __get($name)
    {
        return $this->get($name);
    }

    /**
     * Download a document over HTTP(S).
     *
     * @param string $url Absolute http:// or https:// URL.
     * @return string|false Response body, or false on failure.
     */
    private function _fetchRemote($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0');
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        $content = curl_exec($ch);
        curl_close($ch);
        return $content;
    }

    /**
     * Evaluate an XPath expression against the loaded document.
     *
     * @param string $expression Complete XPath expression.
     * @return DOMNodeList|null Matches, or null when no document is loaded or
     *                          the expression is rejected.
     */
    private function _query($expression)
    {
        if ($this->_xpath === null) {
            return null;
        }
        $nodes = $this->_xpath->query($expression);
        return ($nodes === false) ? null : $nodes;
    }

    /**
     * Resolve the node this handle points at.
     *
     * @return DOMNode|null The node, or null for the root or an empty handle.
     */
    private function _currentNode()
    {
        if ($this->_nodePath === '') {
            return null;
        }
        $nodes = $this->_query($this->_nodePath);
        return ($nodes === null) ? null : $nodes->item(0);
    }
}
// Usage example: fetch the site report page for www.opendir.cn and print the
// text of the link inside the cell whose id is "baidu".
$xp = new XF_HtmlDom();
$xp->loadHtml('http://www.aizhan.com/siteall/www.opendir.cn/');
// Index 0 asks find() for the first match only.
$link = $xp->find("td[@id='baidu']/a", 0);
// get('innertext') is the explicit accessor for the node's text content.
$rows = $link->get('innertext');
print_r($rows);

(编辑:安卓应用网)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

    推荐文章
      热点阅读