加入收藏 | 设为首页 | 会员中心 | 我要投稿 安卓应用网 (https://www.0791zz.com/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 综合聚焦 > 程序设计 > 正文

聚合搜索(四)

发布时间:2020-05-23 02:27:12 所属栏目:程序设计 来源:互联网
导读:1.4 各搜索引擎专用类 搜索引擎专用类用来完成具体的搜索任务,例如,Baidu类用来执行Baidu搜索,Google类用来执行Google搜索。它们都继承自ISearch类,主要是执行一些正则表达式操作,把搜索结果匹配出来。搜索结果作为数据,保存在了XML文件中。最后,这个X

1.4 各搜索引擎专用类

搜索引擎专用类用来完成具体的搜索任务,例如,Baidu类用来执行Baidu搜索,Google类用来执行Google搜索。它们都继承自ISearch类,主要是执行一些正则表达式操作,把搜索结果匹配出来。搜索结果作为数据保存在XML文件中。最后,这个XML文件按照格式化文件result.xsl的格式把搜索结果显示出来。

Search()方法的大致思路是:首先调用GetPageString()方法,把搜索的关键字、页码等信息发送到特定搜索引擎,并接收搜索引擎返回的信息;然后对这个信息进行解析,分析出搜索结果的每条记录,并记录在XML文件中;接着对搜索结果的分页导航进行分析,转换成本系统的形式,并采用Base64编码,把结果记录在XML文件中。在Search()方法执行的最后,XML文件被格式化输出到客户端浏览器显示出来。下面是6大搜索引擎专用类的具体实现代码:

//Google搜索类 google.cs

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
// NOTE(review): this listing was recovered from a damaged copy in which
// backslashes had been turned into forward slashes, string literals were
// wrapped across lines, and repeated call arguments / whole statements were
// dropped. Escapes and line structure are restored below. The parent-node
// argument of every Tools.CreateNode call that the damaged copy showed with
// only two arguments has been filled in by analogy with the intact
// three-argument calls; every other inferred argument or statement carries its
// own "NOTE(review)". Confirm all of them against the original project.
// Each "xxx.cs" section below is a separate source file.

/// <summary>
/// Google adapter: turns a Google result page into the <c>&lt;search&gt;</c> XML document.
/// </summary>
public class Google : ISearch
{
    /// <summary>
    /// Downloads the result page via <c>GetPageString()</c>, extracts the head styles,
    /// the hit count, every result entry and the pager, and stores them as XML nodes.
    /// </summary>
    /// <returns>The populated XML document whose root element is <c>search</c>.</returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page

        // XmlDocument replaces the obsolete XmlDataDocument; only LoadXml and
        // node creation are used here.
        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        // <head>: keep the head content, minus script/meta/title and the head tag itself.
        string style = Tools.Match(xmlstr, "<head>[\\s\\S]*?</head>").Value;
        style = Tools.delTagArray(style, "script,meta,title", true);
        XmlNode xn = Tools.CreateNode(document, document.DocumentElement, "head");
        xn.InnerText = Tools.delTagArray(style, "head", false);

        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Summary line: text between "符合<b>" and "</b>的查询", and the hit count.
        XmlNode txt = Tools.CreateNode(document, body, "key");
        XmlNode count = Tools.CreateNode(document, body, "count");
        string sou = Tools.Match(xmlstr, "(?<=符合<b>)[\\s\\S]*?(?=</b>的查询)").Value;
        string count2 = Tools.Match(xmlstr, "(?<=约有<b>)[\\s\\S]*?(?=</b>)").Value;
        count.InnerText = count2;
        txt.InnerText = sou;

        // One <item> per result <div>.
        MatchCollection mtc = Tools.MatchCollection(xmlstr, "<div class[\\s\\S]*?</div>");
        foreach (Match mt in mtc)
        {
            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode link = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");

            // Content of the first <td> and of the <h2> heading; both are reused below.
            string cell = Tools.Match(mt.Value, "(?<=<td[^>]*?>)[\\s\\S]*?(?=</td>)").Value;
            string heading = Tools.Match(mt.Value, "(?<=<h2[^>]*?>)[\\s\\S]*?(?=</h2>)").Value;
            MatchCollection itemc = Tools.MatchCollection(cell, "[\\s\\S]*?<br[^>]*?>");

            string ul = Tools.Match(heading, "<a[\\s\\S]*?</a>").Value;
            string u_li = Tools.Match(ul, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
            link.InnerText = Tools.Replace(u_li, "^\"", ""); // drop a leading quote
            string u_t = Tools.delTagArray(ul, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            if (itemc.Count > 0)
            {
                desc.InnerText = Tools.delHtml(itemc[0].Value);
            }

            // Raw markup of the entry: heading, cell body, opening and closing fragments.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            sour1.InnerText = heading;

            XmlNode sour2 = Tools.CreateNode(document, item, "sour");
            sour2.InnerText = Tools.delTagArray(cell, "td", false);

            XmlNode begin = Tools.CreateNode(document, item, "begin");
            begin.InnerText = Tools.Match(mt.Value, "(?=<div[^>]*?>)[\\s\\S]*?(?=<h2)").Value;

            XmlNode end = Tools.CreateNode(document, item, "end");
            end.InnerText = Tools.Match(mt.Value, "(?<=</table>)[\\s\\S]*?(?<=</div>)").Value;
        }

        // Pager: point every "/search?..." link back at this system, Base64-encoding the query.
        string page = Tools.Match(xmlstr, "(?=<div id=navbar class=n>)[\\s\\S]*?(?=<center>)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "<a[^>]*?>[\\s\\S]*?</a>");
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Match(mt.Value, @"(?<=href=/search\?)[^\s>]*").Value;
            if (s2.Length == 0)
            {
                // No query string found: replacing the bare prefix would rewrite unrelated text.
                continue;
            }
            page = page.Replace("/search?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        page = Tools.delTagArray(page, "img", false);
        pageNv.InnerText = page;
        return document;
    }
}

// Baidu search class — baidu.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Xml;
using System.Text.RegularExpressions;

/// <summary>
/// Baidu adapter: turns a Baidu result page into the <c>&lt;search&gt;</c> XML document.
/// </summary>
public class Baidu : ISearch
{
    /// <summary>
    /// Downloads the result page via <c>GetPageString()</c> and converts head styles,
    /// hit count, result tables and pager into XML nodes.
    /// </summary>
    /// <returns>The populated XML document whose root element is <c>search</c>.</returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page
        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        // <head>: content of the head without scripts and without the head tag.
        string style = Tools.Match(xmlstr, "<head>[\\s\\S]*?</head>").Value;
        style = Tools.delTagArray(style, "script", true);
        XmlNode xn = Tools.CreateNode(document, document.DocumentElement, "head");
        // NOTE(review): "head" argument reconstructed from the original "remove head tag" comment.
        xn.InnerText = Tools.delTagArray(style, "head", false);

        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Value of the search box and the hit count.
        XmlNode txt = Tools.CreateNode(document, body, "key");
        XmlNode count = Tools.CreateNode(document, body, "count");
        string sou = Tools.Match(xmlstr,
            "(?<=<input name=wd size=\"35\" class=\"i\" value=\")[\\s\\S]*?(?=\" maxlength=\"100\")").Value;
        string count2 = Tools.Match(xmlstr, "(?<=找到相关网页[^\\d])[\\s\\S]*?(?=篇)").Value;
        count.InnerText = count2;
        txt.InnerText = sou;

        // One <item> per result table.
        MatchCollection mtc = Tools.MatchCollection(xmlstr,
            "<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\">[\\s\\S]*?</table>");
        foreach (Match mt in mtc)
        {
            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode link = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");

            string cell = Tools.Match(mt.Value, "(?<=<td[^>]*?>)[\\s\\S]*?(?=</td>)").Value;
            MatchCollection itemc = Tools.MatchCollection(cell, "[\\s\\S]*?<br[^>]*?>");

            // Only cells split into at least three <br>-terminated pieces are treated as results.
            if (itemc.Count >= 3)
            {
                string u1 = Tools.Match(itemc[0].Value, "<a[\\s\\S]*?</a>").Value;
                string u_li = Tools.Match(u1, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
                // NOTE(review): quote-stripping pattern reconstructed from the Sogou class.
                link.InnerText = Tools.Replace(u_li, "[\"']", "");
                // NOTE(review): "a" argument reconstructed from the Google class.
                string u_t = Tools.delTagArray(u1, "a", false);
                title.InnerText = Tools.delHtml(u_t);
                desc.InnerText = Tools.delHtml(itemc[1].Value);

                // Raw markup of every piece, with the <br> tag removed.
                foreach (Match mt1 in itemc)
                {
                    XmlNode sour1 = Tools.CreateNode(document, item, "sour");
                    sour1.InnerText = Tools.delTagArray(mt1.Value, "br", false);
                }
            }

            XmlNode end = Tools.CreateNode(document, item, "end");
            end.InnerText = "</font><br/>";
        }

        // Pager: rewrite every "s?..." link into this system's form.
        string page = Tools.Match(xmlstr, "<div class=\"p\">[\\s\\S]*?</div>").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "<a[^>]*?>[\\s\\S]*?</a>");
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Match(mt.Value, @"(?<=href=s\?)[^\s>]*").Value;
            if (s2.Length == 0)
            {
                // An empty query would make Replace rewrite every "s?" in the pager.
                continue;
            }
            page = page.Replace("s?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;
        return document;
    }
}

// Sogou search class — sogou.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
using System.Web;

/// <summary>
/// Sogou adapter: turns a Sogou result page into the <c>&lt;search&gt;</c> XML document.
/// </summary>
public class Sogou : ISearch
{
    /// <summary>
    /// Downloads the result page via <c>GetPageString()</c> and converts styles,
    /// hit count, result blocks and pager into XML nodes.
    /// </summary>
    /// <returns>The populated XML document whose root element is <c>search</c>.</returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page
        // NOTE(review): "script" argument reconstructed from the original "remove scripts" comment.
        xmlstr = Tools.delTagArray(xmlstr, "script", true);

        // XmlDocument replaces the obsolete XmlDataDocument.
        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        string style = Tools.Match(xmlstr, "(?=<style[^>]*?)[\\s\\S]*?(?<=</style>)").Value;
        XmlNode head = Tools.CreateNode(document, document.DocumentElement, "head");
        head.InnerText = style;
        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Hit count and value of the query box.
        XmlNode count = Tools.CreateNode(document, body, "count");
        XmlNode txt = Tools.CreateNode(document, body, "key");
        string sou = Tools.Match(xmlstr,
            "(?<=<input name=\"query\" type=\"text\" class=\"query\" size=\"35\" tabindex=\"1\" value=\")[\\s\\S]*?(?=\"/>)").Value;
        string count2 = Tools.Match(xmlstr, "(?<=找到)[\\s\\S]*?(?=个网页)").Value;
        count.InnerText = count2;
        txt.InnerText = sou;

        // Results live between the content div and the pager div, one <div> each.
        string xmlstr2 = Tools.Match(xmlstr,
            "(?<=<div id=\"content\">)[\\s\\S]*?(?<=<div id=\"pagebar\">)").Value;
        MatchCollection mtc = Tools.MatchCollection(xmlstr2, "(<div>)[\\s\\S]*?(</div>)");
        foreach (Match mt in mtc)
        {
            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode url = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");
            XmlNode begin = Tools.CreateNode(document, item, "begin");
            XmlNode end = Tools.CreateNode(document, item, "end");

            string ul = Tools.Match(mt.Value, "(?<=<h2>)[\\s\\S]*?(?=</h2>)").Value;
            string ul_li = Tools.Match(ul, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
            url.InnerText = Tools.Replace(ul_li, "[\"']", "");
            // NOTE(review): "a" argument reconstructed from the original "remove hyperlink" comment.
            string u_t = Tools.delTagArray(ul, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            MatchCollection itemc = Tools.MatchCollection(mt.Value, "(?<=<p[^>]*?>)[\\s\\S]*?(?=</p>)");
            if (itemc.Count > 0)
            {
                // NOTE(review): the listing applied Tools.Replace(de, <pattern>, "") here but the
                // pattern was lost; restore it from the original project.
                desc.InnerText = Tools.delHtml(itemc[0].Value);
            }

            // Raw heading markup without <br> tags and onclick handlers.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            // The listing had "(<br^[>]*?)*", which can never match a <br> tag; corrected.
            string sout1str = Tools.Replace(ul, "(<br[^>]*?>)*", "");
            sour1.InnerText = Tools.Replace(sout1str, "(?=onclick=)[\\s\\S]*?(?<=;\")", "");

            // Raw markup after the heading, stored as CDATA.
            XmlNode sour2 = Tools.CreateNode(document, item, "sour");
            string content = Tools.Match(mt.Value, "(?<=</h2>)[\\s\\S]*?(?=</div>)").Value;
            // NOTE(review): the listing applied Tools.Replace(content, <pattern>, "") here but the
            // pattern was lost; restore it from the original project.
            Tools.CreateCData(document, sour2, content);

            begin.InnerText = "<div>";
            end.InnerText = "</div>";
        }

        // Pager: rewrite every "?..." link into this system's form.
        string page = Tools.Match(xmlstr,
            "(?=<!-- begin of page up/down -->)[\\s\\S]*?(?=<!-- end of page up/down -->)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "(?=<a[\\s\\S]*?>)[\\s\\S]*?(?<=</a>)");
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Replace(mt.Value, "\"", "");
            s2 = Tools.Match(s2, @"(?<=href=\?)[\s\S]*?(?=>)").Value;
            if (s2.Length == 0)
            {
                // An empty query would make Replace rewrite every "?" in the pager.
                continue;
            }
            page = page.Replace("?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;
        return document;
    }
}

// iAsk search class — iask.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

/// <summary>
/// iAsk adapter: turns an iAsk result page into the <c>&lt;search&gt;</c> XML document.
/// </summary>
public class Iask : ISearch
{
    /// <summary>
    /// Downloads the result page via <c>GetPageString()</c> and converts styles,
    /// hit count, result tables and pager into XML nodes.
    /// </summary>
    /// <returns>The populated XML document whose root element is <c>search</c>.</returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page

        // XmlDocument replaces the obsolete XmlDataDocument.
        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");
        XmlNode head = Tools.CreateNode(document, document.DocumentElement, "head");
        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        XmlNode count = Tools.CreateNode(document, body, "count");
        // NOTE(review): the "key" node and the "sou" assignment were fused in the listing
        // (CreateNode(document,"<title pattern>").Value); split by analogy with the other classes.
        XmlNode txt = Tools.CreateNode(document, body, "key");
        string sou = Tools.Match(xmlstr, "(?<=<title>)[\\s\\S]*?(?= - 爱问搜索)").Value;
        string count2 = Tools.Match(xmlstr,
            "(?<=找到 <span class=\"ar\">)[\\s\\S]*?(?=</span> 篇网页)").Value;
        count.InnerText = count2;
        txt.InnerText = sou;

        // The listing first matched <head> and then immediately overwrote the result
        // with this match against the whole page; the dead first match is dropped.
        string style = Tools.Match(xmlstr, "(?=<style[^>]*?>)[\\s\\S]*?(?<=</style>)").Value;
        // NOTE(review): tag list lost in the listing; "script" is an assumption.
        style = Tools.delTagArray(style, "script", true);
        head.InnerText = style;

        // One <item> per table inside the result section.
        string xmlstr2 = Tools.Match(xmlstr,
            "(?<=<!-- 网页搜索结果 begin -->)[\\s\\S]*?(?=<!-- 网页搜索结果 end -->)").Value;
        MatchCollection mtc = Tools.MatchCollection(xmlstr2, "<table[^>]*?>[\\s\\S]*?</table>");
        foreach (Match mt in mtc)
        {
            MatchCollection itemc = Tools.MatchCollection(mt.Value, "[\\s\\S]*?(?<=<br[^>]*?>)");
            if (itemc.Count == 0)
            {
                // The listing indexed itemc[0] unconditionally; a table without <br> would throw.
                continue;
            }

            // NOTE(review): creation of item/url/desc nodes was lost in the listing; restored
            // in the same order the other classes use.
            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode url = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");

            string anchor = Tools.Match(itemc[0].Value, "<a[\\s\\S]*?</a>").Value;
            string u_li = Tools.Match(anchor, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
            // NOTE(review): pattern and the u_t statement reconstructed from the sibling classes.
            url.InnerText = Tools.Replace(u_li, "[\"']", "");
            string u_t = Tools.delTagArray(anchor, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            if (itemc.Count > 1)
            {
                desc.InnerText = Tools.delHtml(itemc[1].Value);
            }

            // Raw markup: the title anchor, then every remaining piece concatenated.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            sour1.InnerText = anchor;

            XmlNode sour2 = Tools.CreateNode(document, item, "sour");
            StringBuilder rest = new StringBuilder();
            for (int j = 1; j < itemc.Count; j++)
            {
                rest.Append(itemc[j].Value);
            }
            sour2.InnerText = rest.ToString();
        }

        // Pager: rewrite every "/s?..." link into this system's form.
        string page = Tools.Match(xmlstr,
            "(?<=<!-- 左侧搜索结果 end -->)[\\s\\S]*?(?<=</table>)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "<a[^>]*?>[\\s\\S]*?</a>");
        foreach (Match mt in mcpage)
        {
            // NOTE(review): the listing fused a Tools.Replace and a Tools.Match call here;
            // the quote-stripping step and the replacement target are reconstructed from Sogou.
            string s2 = Tools.Replace(mt.Value, "\"", "");
            s2 = Tools.Match(s2, @"(?<=href=/s\?)[^\s>]*").Value;
            if (s2.Length == 0)
            {
                continue;
            }
            page = page.Replace("/s?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;
        return document;
    }
}

// Yahoo search class — yahoo.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Xml;
using System.Text.RegularExpressions;

/// <summary>
/// Yahoo adapter: turns a Yahoo result page into the <c>&lt;search&gt;</c> XML document.
/// </summary>
public class Yahoo : ISearch
{
    /// <summary>
    /// Downloads the result page via <c>GetPageString()</c> and converts styles,
    /// hit count, result blocks and pager into XML nodes.
    /// </summary>
    /// <returns>The populated XML document whose root element is <c>search</c>.</returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page
        // Cut everything from the "pm r" div up to the next </table>.
        xmlstr = Tools.Replace(xmlstr, "(?=<div class=\"pm r\">)[\\s\\S]*?(?=</table>)", "");

        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");
        // NOTE(review): the head/body, count/key and "sou" statements were fused in the
        // listing; split by analogy with the other classes.
        XmlNode head = Tools.CreateNode(document, document.DocumentElement, "head");
        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        XmlNode count = Tools.CreateNode(document, body, "count");
        XmlNode txt = Tools.CreateNode(document, body, "key");
        string sou = Tools.Match(xmlstr, "(?<=<title>雅虎搜索_)[\\s\\S]*?(?=</title>)").Value;
        count.InnerText = Tools.Match(xmlstr, "(?<=共返回[^\\d])[\\s\\S]*?(?=项)").Value;
        txt.InnerText = sou;

        // Styles: head content without script/title/meta, reduced to the <style> element.
        string style = Tools.Match(xmlstr, "(?<=<head>*?)[\\s\\S]*?(?<=</head>)").Value;
        // NOTE(review): tag list reconstructed from the original "remove scripts and title" comment.
        style = Tools.delTagArray(style, "script,title", true);
        style = Tools.delTagArray(style, "meta", false);
        style = Tools.Match(style, "(?=<style>)[\\s\\S]*?(?<=</style>)").Value;
        head.InnerText = style;

        // One <item> per result block.
        MatchCollection mtc = Tools.MatchCollection(xmlstr, "(<div class=\"i\">)[\\s\\S]*?(</table>)");
        foreach (Match mt in mtc)
        {
            XmlNode item = Tools.CreateNode(document, body, "item");
            string link = Tools.Match(mt.Value,
                "(?<=<div class=\"i\">)[\\s\\S]*?(?=<table cellspacing=\"0\">)").Value;
            string ul_li = Tools.Match(link, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
            if (ul_li == "")
            {
                // Blocks without a link get an empty <item>, as in the listing.
                continue;
            }

            // NOTE(review): url/desc/title creation was fused in the listing; restored.
            XmlNode url = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");
            // NOTE(review): quote-stripping pattern and "a" argument reconstructed from Sogou.
            url.InnerText = Tools.Replace(ul_li, "[\"']", "");
            string u_t = Tools.delTagArray(link, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            string de = Tools.Match(mt.Value,
                "(?=<td class=\"d\">)[\\s\\S]*?(?<=<div class=\"rel\">)").Value;
            desc.InnerText = Tools.delHtml(de);

            XmlNode begin = Tools.CreateNode(document, item, "begin");
            begin.InnerText = "<div class=\"i\">";
            XmlNode end = Tools.CreateNode(document, item, "end");
            end.InnerText = "</div>";

            // Raw markup of the entry with onclick handlers removed.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            // NOTE(review): pattern lost in the listing; the onclick-stripping pattern used
            // a few lines below is assumed.
            sour1.InnerText = Tools.Replace(link, "(?=onclick=)[\\s\\S]*?(?<=;\")", "");

            // NOTE(review): creation of the second "sour" node and the way its content is
            // stored were lost in the listing; InnerText assignment is an assumption.
            XmlNode sour2 = Tools.CreateNode(document, item, "sour");
            string sourstr2 = Tools.Match(mt.Value,
                "(?=<table cellspacing=\"0\">)[\\s\\S]*?(?<=</table>)").Value;
            sourstr2 = Tools.Replace(sourstr2, "(<a[^>]*?)[\\s\\S]*?(?<=-&nbsp;)", "");
            sour2.InnerText = Tools.Replace(sourstr2, "(?=onclick=)[\\s\\S]*?(?<=;\")", "");
        }

        string page = Tools.Match(xmlstr, "(<div id=\"pg\">)[\\s\\S]*?(?<=</div>)").Value;
        // NOTE(review): the listing looped over the pager anchors matched by
        // "(?=<a[\\s\\S]*?>)[\\s\\S]*?(?<=</a>)" and rewrote each href into
        // "?nav_go_post=<Base64>&itemtype=<ItemType>", but the href pattern and the
        // replaced prefix were lost. Restore that loop from the original project;
        // until then the pager is emitted with Yahoo's own links.
        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;
        return document;
    }
}

// Zhongsou search class — zhongsou.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

/// <summary>
/// Zhongsou adapter: turns a Zhongsou result page into the <c>&lt;search&gt;</c> XML document.
/// </summary>
public class Zhongsou : ISearch
{
    /// <summary>
    /// Downloads the result page via <c>GetPageString()</c> and converts styles,
    /// hit count, result tables and pager into XML nodes.
    /// </summary>
    /// <returns>The populated XML document whose root element is <c>search</c>.</returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page
        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");
        XmlNode head = Tools.CreateNode(document, document.DocumentElement, "head");
        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        XmlNode count = Tools.CreateNode(document, body, "count");
        // NOTE(review): the "key" node and the "sou" assignment were fused in the listing;
        // split by analogy with the other classes.
        XmlNode txt2 = Tools.CreateNode(document, body, "key");
        string sou = Tools.Match(xmlstr, "(?<=<title>中搜网页_)[\\s\\S]*?(?=</title>)").Value;
        string count2 = Tools.Match(xmlstr, "(?<=找到)[\\s\\S]*?(?=条结果)").Value;
        count.InnerText = count2;
        txt2.InnerText = sou;

        string style = Tools.Match(xmlstr, "(?=<head>)[\\s\\S]*?(?<=</head>)").Value;
        // NOTE(review): the listing shows only "Tools.Match(style,true)" here; a
        // delTagArray call removing scripts is assumed.
        style = Tools.delTagArray(style, "script", true);
        head.InnerText = style;

        // One <item> per result table.
        MatchCollection mtc = Tools.MatchCollection(xmlstr,
            "(?=<table cellspacing=\"0\" cellpadding=\"0\">)[\\s\\S]*?(?<=</table>)");
        foreach (Match mt in mtc)
        {
            MatchCollection itemc = Tools.MatchCollection(mt.Value, "[\\s\\S]*?<br[^>]*?>");
            if (itemc.Count == 0)
            {
                // The listing indexed itemc[0] unconditionally; a table without <br> would throw.
                continue;
            }

            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode url = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");

            string anchor = Tools.Match(itemc[0].Value, "<a[\\s\\S]*?</a>").Value;
            // NOTE(review): the href pattern, the url assignment and the u_t statement were
            // fused into "Tools.Match(ul,false)" in the listing; reconstructed from Sogou.
            string ul_li = Tools.Match(anchor, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
            url.InnerText = Tools.Replace(ul_li, "[\"']", "");
            string u_t = Tools.delTagArray(anchor, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            if (itemc.Count > 1)
            {
                desc.InnerText = Tools.delHtml(itemc[1].Value);
            }

            // Raw title anchor without its onmousedown handler.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            sour1.InnerText = Tools.Replace(anchor, "(?=onmousedown=)[\\s\\S]*?(?<=\\)\")", "");

            // Raw cell markup without hyperlinks.
            // NOTE(review): creation of the second "sour" node was lost in the listing; restored.
            XmlNode sour2 = Tools.CreateNode(document, item, "sour");
            string txt = Tools.Match(mt.Value, "(?=<td[^>]*?)[\\s\\S]*?(?<=</td>)").Value;
            txt = Tools.Replace(txt, "<a[\\s\\S]*?</a>", "");
            // NOTE(review): tag name lost in the listing; "td" is assumed from the Google class.
            txt = Tools.delTagArray(txt, "td", false);
            sour2.InnerText = txt;

            XmlNode begin = Tools.CreateNode(document, item, "begin");
            begin.InnerText = "<table cellspacing=\"0\" cellpadding=\"0\"><tr><td class=\"f\">";
            XmlNode end = Tools.CreateNode(document, item, "end");
            end.InnerText = "</td></tr></table>";
        }

        // Pager: rewrite every "p?..." link into this system's form.
        string page = Tools.Match(xmlstr, "(<table ><tr><td class=db>)[\\s\\S]*?(</table>)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "<a[^>]*?>[\\s\\S]*?</a>");
        foreach (Match mt in mcpage)
        {
            // NOTE(review): the listing fused a Tools.Replace and a Tools.Match call here;
            // the quote-stripping step and the replacement target are reconstructed from Sogou.
            string s2 = Tools.Replace(mt.Value, "\"", "");
            s2 = Tools.Match(s2, @"(?<=href=p\?)[^\s>]*").Value;
            if (s2.Length == 0)
            {
                // An empty query would make Replace rewrite every "p?" in the pager.
                continue;
            }
            page = page.Replace("p?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;
        return document;
    }
}

(编辑:安卓应用网)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读