【www.bbyears.com--Google】
不过千万别以为Lucene是一个像Google那样的搜索引擎.Lucene甚至不是一个应用程序,它仅仅是一个工具,一个库(Library).你也可以把它理解为一套把索引和搜索功能封装得很好,简单易用的API.利用这套API,你可以很方便地做很多与搜索有关的事情.
Lucene可以对任何数据做索引和搜索.Lucene不管数据源是什么格式,只要它能被转化为文字的形式,就可以被Lucene分析利用.也就是说,不管是MS Word,HTML,PDF还是其他形式的文件,只要你可以从中抽取出文字形式的内容,就可以用Lucene对它们进行索引和搜索.
1、首先看一下代码结构
程序分为3部分:lucenetest是入库程序,读取外部提供的文本并入库;
Pangu.Lucene.Analyzer是一个分析器,基于盘古分词工具;
Website是对外提供的服务接口,对外开放索引数据的查询;
2、分析文本,入库建立索引:
官方例子
1. 建立索引
// Create a writer for the index stored in the "index" folder. The second argument is the
// analyzer that extracts the text to be indexed; the third argument controls whether an
// existing index is overwritten.
IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true);
// Use the writer to add files to the index, starting from the path given as the first argument.
IndexDocs(writer, new System.IO.FileInfo(args[0]));
// Optimize the index once everything has been added.
writer.Optimize();
// Close the writer.
writer.Close();
IndexWriter是对索引进行写操作的一个类,利用它可以创建一个索引对象然后往其中添加文件.需要注意它并不是唯一可以修改索引的类.在索引建好后利用其他类还可以对其进行修改.
构造函数的第一个参数是存放索引的文件夹的名字.第二个参数是一个分析对象,主要用于从文本中抽取那些需要建立索引的内容,把不需要参与建索引的文本内容去掉,比如去掉一些a,the之类的常用词,还可以决定是否大小写敏感.不同的选项通过指定不同的分析对象来控制.第三个参数用于确定是否覆盖原有索引.
第二步就是利用这个writer往索引中添加文件.具体后面再说.
第三步进行优化.
第四步关闭writer.
项目代码:
using Baitone.DSP.ClearModel;
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Lucenetest
{
/// <summary>
/// Indexing program: walks the configured log directory and adds one Lucene document
/// per JSON log line to the index stored at <see cref="indexPath"/>.
/// </summary>
public static class CreateIndex
{
    /// <summary>Directory that holds the Lucene index (app setting "indexPath").</summary>
    public static string indexPath = System.Configuration.ConfigurationManager.AppSettings["indexPath"];

    /// <summary>Root directory that holds the log files to index (app setting "logpath").</summary>
    public static string logpath = System.Configuration.ConfigurationManager.AppSettings["logpath"];

    /// <summary>
    /// Recursively collects every file under <paramref name="dir"/> into <paramref name="list"/>.
    /// </summary>
    /// <param name="dir">Directory to scan.</param>
    /// <param name="list">Accumulator that receives the <see cref="FileInfo"/> entries.</param>
    /// <returns>The same <paramref name="list"/> instance, for convenience.</returns>
    static ArrayList GetAll(DirectoryInfo dir, ArrayList list)
    {
        list.AddRange(dir.GetFiles());
        foreach (DirectoryInfo subDir in dir.GetDirectories())
        {
            GetAll(subDir, list);
        }
        return list;
    }

    /// <summary>
    /// Entry point of the indexing run: makes sure the index directory exists, then indexes
    /// every file found under <see cref="logpath"/> and reports progress on the console.
    /// </summary>
    public static void main()
    {
        if (!System.IO.Directory.Exists(indexPath))
        {
            System.IO.Directory.CreateDirectory(indexPath);
        }

        // Report a missing log directory the same way a missing log file is reported,
        // instead of failing inside GetFiles().
        if (!System.IO.Directory.Exists(logpath))
        {
            Console.WriteLine(logpath + " Log目录不存在" + DateTime.Now.ToString());
            return;
        }

        ArrayList files = GetAll(new DirectoryInfo(logpath), new ArrayList());
        foreach (FileInfo file in files)
        {
            Console.WriteLine("开始" + file.FullName + " 分析." + "\n");
            int count = CRUDIndex(file.FullName);
            Console.WriteLine("已完成" + file.FullName + " 分析.数量:" + count + "\n");
        }
    }

    /// <summary>
    /// Reads one log file line by line and adds a document to the index for every line.
    /// Files whose path contains "log_Tags" are parsed as <c>Tags</c> records, files whose
    /// path contains "log_base" as <c>BaseInfo</c> records; lines of other files are ignored.
    /// </summary>
    /// <param name="path">Full path of the log file.</param>
    /// <returns>The number of documents added to the index.</returns>
    private static int CRUDIndex(string path)
    {
        if (!File.Exists(path))
        {
            Console.WriteLine(path + " Log文件不存在" + DateTime.Now.ToString());
            return 0;
        }

        DateTime dtB = DateTime.Now;
        Console.WriteLine(dtB.ToString("yyyy-MM-dd HH:mm:ss") + " 开始读 " + path + " ....");

        // The record kind depends only on the path, so decide it once instead of per line.
        bool isTagsLog = path.IndexOf("log_Tags", StringComparison.Ordinal) >= 0;
        bool isBaseLog = !isTagsLog && path.IndexOf("log_base", StringComparison.Ordinal) >= 0;

        int added = 0;

        // using blocks guarantee the reader, the directory and the writer (and with it the
        // index write lock) are released even when a line fails to parse.
        using (StreamReader sr = new StreamReader(path, Encoding.UTF8))
        using (FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory()))
        {
            bool isExist = IndexReader.IndexExists(directory);

            // NOTE(review): this force-releases a lock left on the index. It is only safe if no
            // other writer is active on the same index at that moment - confirm.
            if (isExist && IndexWriter.IsLocked(directory))
            {
                IndexWriter.Unlock(directory);
            }

            // Create a new index only when none exists yet; otherwise append to it.
            using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isExist, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Blank lines deserialize to null and carry nothing to index.
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    Document document = null;
                    if (isTagsLog)
                    {
                        document = BuildTagsDocument(line);
                    }
                    else if (isBaseLog)
                    {
                        document = BuildBaseInfoDocument(line);
                    }

                    if (document != null)
                    {
                        writer.AddDocument(document);
                        added++;
                    }
                }
            }
        }

        return added;
    }

    /// <summary>Builds the index document for one line of a tag log.</summary>
    /// <param name="line">Raw JSON text of the log line; also stored as the "content" field.</param>
    /// <returns>The document, or null when the line deserializes to null.</returns>
    private static Document BuildTagsDocument(string line)
    {
        // NOTE(review): the generic argument and the call arguments were lost from the published
        // listing; <Tags>(line) is reconstructed from the declared variable type - confirm.
        Tags tags = Newtonsoft.Json.JsonConvert.DeserializeObject<Tags>(line);
        if (tags == null)
        {
            return null;
        }

        Document document = new Document();
        document.Add(new Field("gid", tags.gid.ToString(), Field.Store.YES, Field.Index.ANALYZED));
        AddVectorField(document, "andId", tags.andId);
        AddVectorField(document, "dpidsha1", tags.dpidsha1);
        AddVectorField(document, "flag", tags.flag.ToString());
        AddVectorField(document, "macsha1", tags.macsha1);
        AddVectorField(document, "mac", tags.mac);
        AddVectorField(document, "make", tags.make);
        AddVectorField(document, "md", tags.md);
        AddVectorField(document, "content", line);
        return document;
    }

    /// <summary>Builds the index document for one line of a base-info log.</summary>
    /// <param name="line">Raw JSON text of the log line; also stored as the "content" field.</param>
    /// <returns>The document, or null when the line deserializes to null.</returns>
    private static Document BuildBaseInfoDocument(string line)
    {
        // NOTE(review): <BaseInfo>(line) is reconstructed in the same way as for Tags - confirm.
        BaseInfo info = Newtonsoft.Json.JsonConvert.DeserializeObject<BaseInfo>(line);
        if (info == null)
        {
            return null;
        }

        Document document = new Document();
        document.Add(new Field("gid", info.gid.ToString(), Field.Store.YES, Field.Index.ANALYZED));
        AddVectorField(document, "ip", info.ip.ToString());
        AddVectorField(document, "lat", info.lat.ToString());
        AddVectorField(document, "lon", info.lon.ToString());
        AddVectorField(document, "eventtime", info.eventtime);
        AddVectorField(document, "appname", info.appname);
        AddVectorField(document, "content", line);
        return document;
    }

    /// <summary>
    /// Adds a stored, analyzed field with term vectors (positions and offsets) to the document.
    /// A null value is skipped, because a Lucene field cannot be created from a null string.
    /// </summary>
    private static void AddVectorField(Document document, string name, string value)
    {
        if (value == null)
        {
            return;
        }

        document.Add(new Field(name, value, Field.Store.YES, Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
    }
}
}
配置文件,配置目录:
3、对外web服务:
提供查询关键字,进行索引的查询;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using PanGu;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Web;
using System.Web.Services;
/// <summary>
/// Web service that exposes keyword search over the Lucene index stored at <see cref="indexPath"/>.
/// </summary>
[WebService(Namespace = "http://tempuri.org/")]
[WebServiceBinding(ConformsTo = WsiProfiles.BasicProfile1_1)]
// To allow this web service to be called from script using ASP.NET AJAX, uncomment the following line.
// [System.Web.Script.Services.ScriptService]
public class s : System.Web.Services.WebService
{
    public s()
    {
        // Uncomment the following line if using designed components.
        //InitializeComponent();
    }

    /// <summary>Directory that holds the Lucene index (app setting "indexPath").</summary>
    public static string indexPath = System.Configuration.ConfigurationManager.AppSettings["indexPath"];

    /// <summary>
    /// Searches the "gid", "dpidsha1" and "content" fields for <paramref name="keyword"/> and
    /// returns the stored "content" of the best matches (at most 200), one per line.
    /// </summary>
    /// <param name="keyword">Query text, parsed with Lucene query syntax.</param>
    /// <returns>The concatenated "content" values, or an empty string for a null or empty keyword.</returns>
    [WebMethod]
    public string search(string keyword)
    {
        if (string.IsNullOrEmpty(keyword))
        {
            return "";
        }

        // NOTE(review): the indexing program shown in the same article builds the index with
        // PanGuAnalyzer; searching with StandardAnalyzer may tokenize differently - confirm.
        Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
        string[] fields = { "gid", "dpidsha1", "content" };

        // Dispose the searcher and the directory so index file handles are released after
        // every request.
        using (FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath)))
        using (IndexSearcher searcher = new IndexSearcher(directory, true))
        {
            MultiFieldQueryParser queryP = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, analyzer);
            Query query = queryP.Parse(keyword);
            var hits = searcher.Search(query, 200);

            StringBuilder sb = new StringBuilder();

            // Iterate over the hits actually returned: TotalHits counts every match in the
            // index, while ScoreDocs holds no more than the 200 requested above.
            for (int i = 0; i < hits.ScoreDocs.Length; i++)
            {
                Document doc = searcher.Doc(hits.ScoreDocs[i].Doc);

                // NOTE(review): the separator literal was damaged in the published listing
                // (it read "/r/n/" followed by a line break); CRLF is used here - confirm.
                sb.Append(doc.Get("content") + "\r\n");
            }

            return sb.ToString();
        }
    }
}
目前是简单的例子。后续将继续发布深入的demo。