Java应用开源框架实现简易web搜索引擎

发布时间：2020-05-23 16:07:42 所属栏目：Java 来源：互联网

导读：应用 Java 的开源库编写一个搜索引擎。该引擎能爬取一个网站的内容，并根据网页内容进行深度爬取，获取所有相关的网页地址和内容；用户可以通过关键词搜索所有相关的网址。

引言

应用 Java 的开源库编写一个搜索引擎。该引擎能爬取一个网站的内容，并根据网页内容进行深度爬取，获取所有相关的网页地址和内容；用户可以通过关键词搜索所有相关的网址。

具体功能

(1) 用户可以指定爬取一个url对应的网页的内容。
(2) 对网页内容进行解析，并获取其中所有的url链接地址。
(3) 用户可以设定爬取深度，代表着从初始url对应的页面开始，可以爬取其中所有的url对应的网页内的url，以此类推。深度越大，能爬取到的网站越多。
(4) 对爬取到的url内容进行保存、建立索引。建立索引的内容是url地址本身，和url对应的网页标题。
(5) 用户可以通过关键词对网址进行搜索，找出有该关键词的url地址。
(6) 建立索引和搜索索引的过程能智能识别中文关键词，能对关键词进行分词操作。
(7) 用户可以指定保存索引的地址、初始url、爬取深度、进行搜索的关键词和最大匹配项。

开源框架

Lucene
Jsoup

源码

爬虫部分：Spider.java

package webCrawler.Spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import webCrawler.Index.BuildIndex;

/**
 * @author lannooo
 */

public class Spider {
  /** Only links whose absolute URL contains this substring are followed. */
  private static final String DOMAIN_FILTER = "zju.edu.cn";

  /** Every URL collected so far by {@link #getAll()}, without duplicates. */
  ArrayList<String> URLs;
  private String startURL;
  private int digLevel;

  /**
   * Creates a crawler; nothing is fetched until {@link #getAll()} is called.
   *
   * @param startURL the URL the crawl starts from
   * @param digLevel how many levels of links to follow from the start URL
   */
  public Spider(String startURL,int digLevel){
    this.startURL = startURL;
    this.digLevel = digLevel;
    this.URLs = new ArrayList<>();
  }

  /**
   * Fetches every page in {@code arrayList} and collects the links found on them.
   *
   * @param level remaining crawl depth; nothing is fetched when it is not positive
   * @param arrayList the URLs whose pages should be scanned for links
   * @return the de-duplicated links found on those pages; an empty list (never
   *         {@code null}) when {@code level <= 0} or {@code arrayList} is {@code null}
   * @throws IOException declared for interface compatibility
   */
  public ArrayList<String> getLevelURLs(int level,ArrayList<String> arrayList)
      throws IOException{
    if(level<=0 || arrayList==null){
      return new ArrayList<>();
    }
    /* A set removes links that appear on more than one page. */
    HashSet<String> found = new HashSet<>();
    for(String url: arrayList){
      found.addAll(getBareLinks(url));
    }
    return new ArrayList<>(found);
  }

  /**
   * Crawls breadth-first from the start URL for {@code digLevel} levels and stores
   * every URL encountered (including the start URL) in {@link #URLs}.
   *
   * @throws IOException declared for interface compatibility
   */
  public void getAll() throws IOException{
    HashSet<String> seen = new HashSet<>();
    ArrayList<String> frontier = new ArrayList<>();
    frontier.add(startURL);
    seen.add(startURL);

    for(int i=digLevel; i>0 && !frontier.isEmpty(); i--){
      System.out.println("Dig into level: " + (digLevel-i+1));
      /*
       * Only URLs that have not been seen yet go into the next frontier.
       * A page already fetched at a shallower level would contribute the same
       * links again (assuming its content does not change between requests),
       * so skipping it saves requests without shrinking the result.
       */
      ArrayList<String> next = new ArrayList<>();
      for(String each: getLevelURLs(i,frontier)){
        if(seen.add(each)){
          next.add(each);
        }
      }
      frontier = next;
    }

    /* Keep whatever an earlier call already collected. */
    seen.addAll(URLs);
    URLs = new ArrayList<>(seen);
  }

  /**
   * Fetches the title of every collected URL and writes (url, title) pairs
   * into a new index. URLs whose page cannot be fetched are skipped.
   *
   * @param path directory in which the index is stored
   * @throws IOException declared for interface compatibility
   */
  public void storeURLsAndInfo(String path) throws IOException{
    BuildIndex build = new BuildIndex(path);
    try {
      for(String each:URLs){
        String text = getLinkText(each);
        if(text!=null){
          build.addField("url",each);
          build.addField("text",text);
          /* Commit this (url, text) entry to the index. */
          build.pushIndex();
        }
      }
    } finally {
      /* Release the index even if something unexpected is thrown above. */
      build.close();
    }
  }

  /**
   * Fetches a page and returns its title.
   *
   * @param url the page whose title is wanted
   * @return the page title, or {@code null} when the page could not be fetched
   * @throws IOException declared for interface compatibility
   */
  public String getLinkText(String url) throws IOException{
    Document document;
    try {
      /* Connect through Jsoup with a 3 second timeout. */
      document = Jsoup.connect(url).timeout(3000).get();
    } catch (Exception e) {
      /* Any fetch failure (not only a timeout) ends up here. */
      System.out.println("[TIMEOUT]Get title of url:"+url);
      return null;
    }
    return document.title();
  }


  /**
   * Fetches a page and extracts the links inside its {@code <body>}.
   * Fragments ({@code #...}) and one trailing {@code '/'} are stripped, and only
   * links containing {@code zju.edu.cn} are kept.
   *
   * @param url the page to scan
   * @return the de-duplicated links; empty when the page could not be fetched
   * @throws IOException declared for interface compatibility
   */
  public ArrayList<String> getBareLinks(String url) throws IOException{
    Document document;
    try {
      document = Jsoup.connect(url).timeout(2000).get();
    } catch (Exception e) {
      System.out.println("[FAILED]Get links of url:"+url);
      return new ArrayList<>();
    }
    /* All <a> elements inside <body> that carry an href attribute. */
    Elements links = document.select("body").select("a[href]");

    HashSet<String> unique = new HashSet<>();
    for(Element link: links){
      String href = link.attr("abs:href");
      /* Drop the whole fragment, not just the '#' character. */
      int fragment = href.indexOf('#');
      if(fragment>=0){
        href = href.substring(0,fragment);
      }
      if(!href.contains(DOMAIN_FILTER)){
        continue;
      }
      if(href.endsWith("/")){
        href = href.substring(0,href.length()-1);
      }
      unique.add(href);
    }
    return new ArrayList<>(unique);
  }

  /**
   * Interactive entry point: asks for the start URL, the crawl depth and the
   * index path, then crawls and builds the index.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    Scanner in = new Scanner(System.in);
    System.out.println("Enter url:");
    String url = in.nextLine().trim();
    while(!url.startsWith("http://") && !url.startsWith("https://")){
      System.out.println("http:// or https:// is needed!");
      System.out.println("Enter url:");
      url = in.nextLine().trim();
    }
    System.out.println("Enter depth to dig more urls[<=3 recommended]：");
    int depth = readDepth(in);
    Spider spider = new Spider(url,depth);
    System.out.println("Enter path you want to save[default=d:/index-spider]:");
    String path = in.nextLine().trim();
    if(path.length()==0){
      path = "d:/index-spider";
    }
    try {
      System.out.println("Start fetching...");
      spider.getAll();
      System.out.println("Urls got success!");
      spider.storeURLsAndInfo(path);
      System.out.println("Stored success!");
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Reads the crawl depth as a whole line, so the line terminator is consumed
   * and the following path prompt is not skipped.
   */
  private static int readDepth(Scanner in){
    while(true){
      String line = in.nextLine().trim();
      try {
        return Integer.parseInt(line);
      } catch (NumberFormatException e) {
        System.out.println("Depth must be an integer, try again:");
      }
    }
  }

}

建立索引：BuildIndex.java

package webCrawler.Index;

import java.io.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @author lannooo
 *
 */
public class BuildIndex {
  private File file;
  private Directory directory;
  private IndexWriter indexWriter;
  private IndexWriterConfig config;
  private Analyzer analyzer;
  /** The document currently being assembled by {@link #addField}. */
  private Document document;

  /**
   * Opens an index writer on the given directory. Failures are reported on
   * stderr and leave the instance without a writer; later calls then drop
   * their documents instead of throwing.
   *
   * @param path directory in which the index is built
   */
  public BuildIndex(String path) {
    /* Created first so addField() is usable even if opening the index fails. */
    document = new Document();
    try {
      file = new File(path);
      directory = FSDirectory.open(file);
      analyzer = new IKAnalyzer();    /* Chinese word-segmentation analyzer */
      config = new IndexWriterConfig(Version.LUCENE_4_10_0,analyzer);
      indexWriter = new IndexWriter(directory,config);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Adds a stored text field to the document being assembled.
   *
   * @param fieldName name of the new field
   * @param fieldText content of the new field
   */
  public void addField(String fieldName,String fieldText){
    try{
      Field field = new TextField(fieldName,fieldText,Field.Store.YES);
      document.add(field);
    }catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Writes the assembled document to the index and starts a fresh one.
   */
  public void pushIndex(){
    if(indexWriter==null){
      System.err.println("Index writer is not open; document dropped");
      document = new Document();
      return;
    }
    try {
      indexWriter.addDocument(document);
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      /*
       * Always start over, even after a failure; otherwise the next entry's
       * fields would be appended to the document that failed to be written.
       */
      document = new Document();
    }
  }

  /**
   * Adds one complete (url, text) document to the index.
   *
   * @param url the URL to index
   * @param text the text associated with the URL
   */
  public void addOneIndex(String url,String text){
    this.addField("url",url);
    this.addField("text",text);
    this.pushIndex();
  }

  /**
   * Closes the index writer and the underlying directory.
   */
  public void close(){
    try {
      if(indexWriter!=null){
        indexWriter.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      /* The directory is opened by this class, so it is closed here as well. */
      try {
        if(directory!=null){
          directory.close();
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

}

（编辑：安卓应用网）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！