基於spring boot架構和word分詞器的分詞檢索,排序,分頁實現
阿新 • 發佈:2019-01-11
本文不適合Java初學者,適合對spring boot有一定了解的同學。文中可能涉及到一些實體類、dao類、工具類,文中沒有給出這些類,大家不必在意,不影響本文的核心內容,本文重在對方法的梳理。
word分詞器maven依賴
<dependency>
<groupId>org.apdplat</groupId>
<artifactId>word</artifactId>
<version>1.3</version>
</dependency>
spring boot的常見依賴在這裡我就不列舉了,可以見文章。先構建一個PageUtil類,用於封裝分頁排序方法。
現在構建一個SearchService請看下面程式碼,package com.frank.demo.util; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; public class PageUtil { // 分頁方法 public static <T> List<T> splitList(List<T> list, int pageSize, int curPage) { List<T> subList = new ArrayList<T>(); int listSize = list.size(); int star = pageSize * curPage; int end = pageSize * (curPage + 1); if (end > listSize) { end = listSize; } if (star >= listSize) { return new ArrayList<T>(); } for (int i = star; i < end; i++) { subList.add(list.get(i)); } return subList; } // 排序(搜尋內容按照相似度高低排序) private static void comparator(List<EtlSearchCompanyResponseDto> data) { Collections.sort(data, new Comparator<EtlSearchCompanyResponseDto>() { @Overridepublic int compare(EtlSearchCompanyResponseDto o1, EtlSearchCompanyResponseDto o2) { int cp = 0; if (o1.getMatching() > o2.getMatching()) { cp = -1; } else if (o1.getMatching() < o2.getMatching()) { cp = 1; } return cp; } }); } }
package com.frank.demo.service; //java內部工具 import java.util.Collections; import java.util.Comparator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; //基於spring boot整合hibernate的標準查詢 import javax.persistence.criteria.CriteriaBuilder; import javax.persistence.criteria.CriteriaQuery; import javax.persistence.criteria.Predicate; import javax.persistence.criteria.Root; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Sort; import org.springframework.data.domain.Sort.Direction; import org.springframework.data.jpa.domain.Specification; import org.springframework.stereotype.Service; // 分詞器 import org.apdplat.word.WordSegmenter; import org.apdplat.word.segmentation.Word; //用到的dao、實體類、工具類等,本文重在方法上的理解不必在意這些輔助類 import com.frank.demo.dao.EtlDataT1004Dao; import com.frank.demo.dao.EtlDataT1009Dao; import com.frank.demo.dao.EtlDataT1022Dao; import com.frank.demo.dto.EtlCreatDueDiligenceRequestDto; import com.frank.demo.dto.EtlSearchCompanyResponseDto; import com.frank.demo.entity.EtlDataT1004; import com.frank.demo.entity.EtlDataT1009; import com.frank.demo.entity.EtlDataT1022; import com.frank.demo.util.api.ApiResponse; import com.frank.demo.util.dto.v1.PageRequestDto; import com.frank.demo.util.PageUtil; @Service public class SearchService { @Autowired EtlDataT1004Dao etlDataT1004Dao; @Autowired EtlDataT1009Dao etlDataT1009Dao; @Autowired EtlDataT1022Dao etlDataT1022Dao; private List<Word> words; //本例是多資料來源搜尋,所以採用的是從三張表中獲取相似公司名稱的記錄,再計算每條記錄的相似度,最後統一放到list集合進行排序,最後採用記憶體分頁返回(提示在資料量不是特別大的情景下可以這麼做,如果資料量上百萬,建議採用搜尋引擎實現) public Map<String, Object> searchCompany(EtlCreatDueDiligenceRequestDto request, PageRequestDto page) { Map<String, Object> response = new LinkedHashMap<String, Object>(); response.put(ApiResponse.KEY_MESSAGE, ApiResponse.MESSAGE_OK); List<EtlSearchCompanyResponseDto> data = new LinkedList<>(); // 採用分詞檢索按照相似度高低進行排序(資料來源於三個地方,上交所,深交所,中小型企業股權轉讓系統) words = 
WordSegmenter.segWithStopWords(request.getCompanyName());//通過word分詞器獲取分詞結果 Sort shsort = new Sort(Direction.ASC,"f8");//列用資料庫對匹配結果進行一次排序 List<EtlDataT1004> shdatas = etlDataT1004Dao.findAll(new Specification<EtlDataT1004>() { @Override public Predicate toPredicate(Root<EtlDataT1004> root, CriteriaQuery<?> query, CriteriaBuilder cb) { List<Predicate> predicates = new LinkedList<>(); for (Word word : words) { predicates.add(cb.like(root.get("f8").as(String.class), "%" + word.getText() + "%")); } Predicate[] p = new Predicate[predicates.size()]; return cb.or(predicates.toArray(p)); } },shsort); // 匹配度計算 for (EtlDataT1004 t1004 : shdatas) { EtlSearchCompanyResponseDto responseDto = new EtlSearchCompanyResponseDto(t1004.getF8().split("/")[0], t1004.getF8().split("/")[1], t1004.getF1(), "1", t1004.getF9()); int i = 0; for (Word word : words) { if (t1004.getF8().contains(word.getText())) { i++; } } responseDto.setCompanyLegal(t1004.getF11()); responseDto.setMatching(i); data.add(responseDto); } Sort szsort = new Sort(Direction.ASC,"f3"); List<EtlDataT1009> szDatas = etlDataT1009Dao.findAll(new Specification<EtlDataT1009>() { @Override public Predicate toPredicate(Root<EtlDataT1009> root, CriteriaQuery<?> query, CriteriaBuilder cb) { List<Predicate> predicates = new LinkedList<>(); for (Word word : words) { predicates.add(cb.or(cb.like(root.get("f3").as(String.class), "%" + word.getText() + "%"))); predicates.add(cb.or(cb.like(root.get("f4").as(String.class), "%" + word.getText() + "%"))); } Predicate[] p = new Predicate[predicates.size()]; return cb.or(predicates.toArray(p)); } },szsort); // 匹配度計算 for (EtlDataT1009 t1009 : szDatas) { EtlSearchCompanyResponseDto responseDto = new EtlSearchCompanyResponseDto(t1009.getF3(), t1009.getF4(), t1009.getF1(), "2", t1009.getF5()); int i = 0; for (Word word : words) { if (t1009.getF3().contains(word.getText())) { i++; } else if (t1009.getF4().contains(word.getText())) { i++; } } responseDto.setMatching(i); data.add(responseDto); } 
Sort gzsort = new Sort(Direction.ASC,"f11"); List<EtlDataT1022> gzDatas = etlDataT1022Dao.findAll(new Specification<EtlDataT1022>() { @Override public Predicate toPredicate(Root<EtlDataT1022> root, CriteriaQuery<?> query, CriteriaBuilder cb) { List<Predicate> predicates = new LinkedList<>(); for (Word word : words) { predicates.add(cb.or(cb.like(root.get("f11").as(String.class), "%" + word.getText() + "%"))); predicates.add(cb.or(cb.like(root.get("f12").as(String.class), "%" + word.getText() + "%"))); } Predicate[] p = new Predicate[predicates.size()]; return cb.or(predicates.toArray(p)); } },gzsort); // 匹配度計算 for (EtlDataT1022 t1022 : gzDatas) { EtlSearchCompanyResponseDto responseDto = new EtlSearchCompanyResponseDto(t1022.getF11(), t1022.getF12(), t1022.getF1(), "3", t1022.getF14()); int i = 0; for (Word word : words) { if (t1022.getF11().contains(word.getText())) { i++; } else if (t1022.getF12().contains(word.getText())) { i++; } } responseDto.setCompanyLegal(t1022.getF15()); responseDto.setMatching(i); data.add(responseDto); } // 排序分頁 PageUtil.searchCompanyComparator(data); List<EtlSearchCompanyResponseDto> pages = PageUtil.splitList(data, page.getSize(), page.getPage()-1); response.put(ApiResponse.KEY_DATA, pages); Map<String, Object> pageMap = new LinkedHashMap<>(); int size = data.size() / page.getSize(); if (data.size() % page.getSize() != 0) { size++; } pageMap.put("pageCount", size); response.put(ApiResponse.KEY_PAGE, pageMap); return response; } }
給使用word分詞器的朋友一個提醒:word分詞器初次呼叫時會載入詞庫,所以建議大家在專案啟動的時候預設去呼叫一下分詞器的介面,這樣你在使用分詞的時候就不會等待很長時間。正常載入後,本例經測試在10萬級別的資料量下返回時間在1s內。
有疑問的朋友可以在評論中留言了,看到會第一時間回覆!
喜歡朋友可以關注我的個人微信公眾號哦,會同步更新相應技術,二維碼見下圖。
萌萌技術