1. 程式人生 > >算法之智能搜索(上)

算法之智能搜索(上)

rdma fff () setname sets exce skip entry water

筆者並不了解,各大搜索網站是怎麽實現智能搜索的。以下只是筆者一時的想法,筆者覺得這個方法可以實現智能匹配搜索內容。

一、首先我們獲取細胞詞庫內容

①建表語句:

-- Recreate the word table from scratch: one row per dictionary word.
DROP TABLE IF EXISTS `sougou_ciku`;
CREATE TABLE `sougou_ciku` (
    `id` varchar(50) NOT NULL,           -- primary key
    `text` varchar(100) NOT NULL,        -- word content
    `below` varchar(50) default NULL,    -- what the word belongs to
    `remark` varchar(100) default NULL,  -- free-form remark
    -- The mapped entity treats `id` as the primary key, so enforce it here too.
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

②創建映射實體類:

package com.css.java.learning.model;
/**
 * Entity mapped to one row of the {@code sougou_ciku} table.
 *
 * <p>A plain mutable bean: every property starts out {@code null} and is
 * exposed through a getter/setter pair.
 */
public class SouGouCiKu {

    /** Primary key. */
    private String id;

    /** Content (the word itself). */
    private String text;

    /** What the word belongs to. */
    private String below;

    /** Remark. */
    private String remark;

    /** @return the primary key, or {@code null} if not set */
    public String getId() {
        return this.id;
    }

    /** @param id the primary key to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the content, or {@code null} if not set */
    public String getText() {
        return this.text;
    }

    /** @param text the content to store */
    public void setText(String text) {
        this.text = text;
    }

    /** @return what the word belongs to, or {@code null} if not set */
    public String getBelow() {
        return this.below;
    }

    /** @param below what the word belongs to */
    public void setBelow(String below) {
        this.below = below;
    }

    /** @return the remark, or {@code null} if not set */
    public String getRemark() {
        return this.remark;
    }

    /** @param remark the remark to store */
    public void setRemark(String remark) {
        this.remark = remark;
    }
}

③創建搜狗scel文件閱讀器:

package com.css.java.learning.massbag;
import java.util.List;
import java.util.Map;
/**
 * Holds what {@code SougouScelReader} extracts from a .scel file: four
 * descriptive strings plus the word map.
 *
 * <p>A plain mutable bean; every property is {@code null} until set.
 */
public class SougouScelMdel {

    /** Words grouped under a string key (the reader uses the pinyin spelling as key). */
    private Map<String, List<String>> wordMap;

    private String name;

    private String type;

    private String description;

    private String sample;

    /** @return the name, or {@code null} if not set */
    public String getName() {
        return this.name;
    }

    /** @param name the name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the type, or {@code null} if not set */
    public String getType() {
        return this.type;
    }

    /** @param type the type to store */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the description, or {@code null} if not set */
    public String getDescription() {
        return this.description;
    }

    /** @param description the description to store */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text, or {@code null} if not set */
    public String getSample() {
        return this.sample;
    }

    /** @param sample the sample text to store */
    public void setSample(String sample) {
        this.sample = sample;
    }

    /** @return the word map exactly as stored (no copy is made), or {@code null} if not set */
    public Map<String, List<String>> getWordMap() {
        return this.wordMap;
    }

    /**
     * Stores the word map. Package-private: only classes in this package can call it.
     *
     * @param wordMap the map to store; kept by reference
     */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }
}

package com.css.java.learning.massbag;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Parser for Sogou cell-dictionary ({@code .scel}) files.
 *
 * <p>The file is read strictly front to back: an 8-byte header, four
 * zero-terminated strings at fixed offsets (name, type, description, sample),
 * a pinyin syllable table starting at offset {@code 0x1540}, and finally the
 * word table, which runs to the end of the stream.
 *
 * <p>An instance reuses one scratch buffer ({@link #output}) across calls and
 * does no locking, so a single instance must not parse from several threads at
 * once.
 */
public class SougouScelReader {

    /** Charset name used to decode every string read from the file. */
    protected static String encoding = "UTF-16LE";

    /** Scratch buffer reused by {@link #readString}; reset at the start of each call. */
    protected ByteArrayOutputStream output = new ByteArrayOutputStream();

    /**
     * Parses the given file.
     *
     * @param file the .scel file to open
     * @return the parsed model
     * @throws IOException if the file cannot be opened or read, or ends prematurely
     */
    public SougouScelMdel read(File file) throws IOException {
        return read(new FileInputStream(file));
    }

    /**
     * Parses the resource behind the given URL.
     *
     * @param url location of the .scel data
     * @return the parsed model
     * @throws IOException if the stream cannot be opened or read, or ends prematurely
     */
    public SougouScelMdel read(URL url) throws IOException {
        return read(url.openStream());
    }

    /**
     * Reads one zero-terminated string (two bytes per code unit) that starts at
     * absolute offset {@code pos}.
     *
     * @param input stream positioned at offset {@code reads[0]}
     * @param pos   absolute offset at which the string starts
     * @param reads single-element array carrying the current stream offset; on
     *              return it holds the offset just past the string's terminator
     * @return the decoded string, without the terminator
     * @throws EOFException if the stream ends before the terminator is found
     * @throws IOException  on any other read failure
     */
    protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
        skipFully(input, pos - reads[0]);
        int read = pos;
        output.reset();
        while (true) {
            int c1 = input.read();
            int c2 = input.read();
            // read() yields -1 at end of stream; without this check a truncated
            // file would never match the 0/0 terminator and the loop would not end.
            if ((c1 | c2) < 0) {
                throw new EOFException(
                        "Unexpected end of stream inside string starting at offset 0x"
                                + Integer.toHexString(pos));
            }
            read += 2;
            if (c1 == 0 && c2 == 0) {
                break;
            }
            output.write(c1);
            output.write(c2);
        }
        reads[0] = read;
        return new String(output.toByteArray(), encoding);
    }

    /**
     * Parses a complete .scel stream and closes it afterwards, whether or not
     * parsing succeeds.
     *
     * @param in the stream to parse; always closed before this method returns
     * @return the parsed model, with its word map keyed by the joined pinyin syllables
     * @throws EOFException if the stream ends in the middle of a record
     * @throws IOException  on any other read failure
     */
    public SougouScelMdel read(InputStream in) throws IOException {
        SougouScelMdel model = new SougouScelMdel();
        DataInputStream input = new DataInputStream(in);
        try {
            // --- 8-byte header -------------------------------------------------
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
            input.readFully(bytes);
            // First byte of the second header word selects where the word table starts.
            int flag1 = bytes[0];
            assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);

            // --- fixed-offset description strings ------------------------------
            int[] reads = new int[] {8};
            model.setName(readString(input, 0x130, reads));
            model.setType(readString(input, 0x338, reads));
            model.setDescription(readString(input, 0x540, reads));
            model.setSample(readString(input, 0xd40, reads));

            // --- pinyin table --------------------------------------------------
            int read = reads[0];
            skipFully(input, 0x1540 - read);
            read = 0x1540;
            input.readFully(bytes);
            read += 4;
            assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);
            bytes = new byte[128];
            Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
            while (true) {
                int mark = readUnsignedShort(input);
                int size = input.readUnsignedByte();
                skipFully(input, 1);
                read += 4;
                assert (size > 0 && (size % 2) == 0);
                // size comes from an unsigned byte and can exceed the 128-byte buffer.
                if (size > bytes.length) {
                    bytes = new byte[size];
                }
                input.readFully(bytes, 0, size);
                read += size;
                String py = new String(bytes, 0, size, encoding);
                pyMap.put(mark, py);
                // "zuo" is treated as the last entry of the table.
                if ("zuo".equals(py)) {
                    break;
                }
            }

            // --- word table ----------------------------------------------------
            if (flag1 == 0x44) {
                skipFully(input, 0x2628 - read);
            } else if (flag1 == 0x45) {
                skipFully(input, 0x26C4 - read);
            }
            StringBuilder buffer = new StringBuilder();
            Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
            while (true) {
                int size = readUnsignedShort(input);
                if (size < 0) {
                    // End of stream at an entry boundary: normal termination.
                    break;
                }
                int count = readUnsignedShort(input);
                if (count < 0) {
                    throw new EOFException("Unexpected end of stream inside word-table entry");
                }
                int len = count / 2;
                assert (len * 2 == count);
                buffer.setLength(0);
                for (int i = 0; i < len; i++) {
                    int key = readUnsignedShort(input);
                    if (key < 0) {
                        throw new EOFException("Unexpected end of stream inside pinyin index list");
                    }
                    // NOTE(review): the separator is U+2018; it may be a mangled ASCII
                    // apostrophe, but it is kept as-is because the map keys depend on it.
                    buffer.append(pyMap.get(key)).append("‘");
                }
                // Drop the trailing separator; an entry without syllables has none to drop.
                if (buffer.length() > 0) {
                    buffer.setLength(buffer.length() - 1);
                }
                String py = buffer.toString();
                List<String> list = wordMap.get(py);
                if (list == null) {
                    list = new ArrayList<String>();
                    wordMap.put(py, list);
                }
                for (int i = 0; i < size; i++) {
                    count = readUnsignedShort(input);
                    if (count < 0) {
                        throw new EOFException("Unexpected end of stream inside word record");
                    }
                    if (count > bytes.length) {
                        bytes = new byte[count];
                    }
                    input.readFully(bytes, 0, count);
                    String word = new String(bytes, 0, count, encoding);
                    // Each word is followed by 12 bytes that this reader does not use.
                    skipFully(input, 12);
                    list.add(word);
                }
            }
            model.setWordMap(wordMap);
            return model;
        } finally {
            in.close();
        }
    }

    /**
     * Reads a little-endian unsigned 16-bit value.
     *
     * @param in stream to read two bytes from
     * @return the value in the range 0..65535, or {@link Integer#MIN_VALUE} if the
     *         stream ended before both bytes could be read
     * @throws IOException on read failure
     */
    protected final int readUnsignedShort(InputStream in) throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        if ((ch1 | ch2) < 0) {
            return Integer.MIN_VALUE;
        }
        return (ch2 << 8) | ch1;
    }

    /**
     * Skips exactly {@code n} bytes. A single {@code skip} call is allowed to skip
     * fewer bytes than requested, which would silently shift every later offset.
     *
     * @param in stream to advance
     * @param n  number of bytes to skip
     * @throws EOFException if the stream ends before {@code n} bytes were skipped
     * @throws IOException  on any other failure
     */
    private static void skipFully(InputStream in, long n) throws IOException {
        if (n <= 0) {
            // Non-positive distances are handed to the stream unchanged, exactly as
            // the plain skip() call used here previously did.
            in.skip(n);
            return;
        }
        long remaining = n;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
                continue;
            }
            // skip() made no progress: consume one byte to tell "temporarily
            // nothing skipped" apart from end of stream.
            if (in.read() < 0) {
                throw new EOFException("Unexpected end of stream while skipping " + n + " bytes");
            }
            remaining--;
        }
    }

}

④從搜狗官網下載細胞詞庫.scel文件
略!

⑤讀取細胞詞庫文件.scel插入數據庫

/**
 * Reads the .scel file at {@code path}, walks every word in its word map and
 * prints each one to standard output, inserting it into the database when
 * {@code jugeWord} says so.
 *
 * @param path location of the .scel file
 * @throws IOException if the file cannot be read
 */
private static void sogou(String path) throws IOException {
    File file = new File(path);
    SougouScelMdel model = new SougouScelReader().read(file);
    // Word map: <pinyin, words>
    Map<String, List<String>> words = model.getWordMap();
    for (Entry<String, List<String>> entry : words.entrySet()) {
        for (String word : entry.getValue()) {
            /*
             * Check whether the word already appears in the database: add it
             * when absent, do nothing when present.
             * (The checking method is not shown here.)
             */
            // NOTE(review): the insert runs when jugeWord returns true; if true
            // means "already exists" this is the opposite of what the comment
            // above describes -- confirm jugeWord's return convention.
            boolean shouldInsert = jugeWord(word);
            if (shouldInsert) {
                /*
                 * Insert the word into the database for later use.
                 * (This method is not shown either.)
                 */
                insert(word);
            }
            System.out.println(word);
        }
    }
}

⑥執行搜狗細胞詞庫插入數據庫

筆者以下面的文件為例:

技術分享圖片

得到以下數據

技術分享圖片

下篇將講解筆者自創的簡單算法:拆分輸入語句並匹配詞庫,完成搜索過程。

算法之智能搜索(上)