Nutch2.3.1原始碼開發環境搭建
阿新 • • 發佈:2019-02-10
原始碼下載
修改配置檔案
修改配置檔案conf/nutch-site.xml
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!--此引數主要用於在IDE環境開發模式執行,在構建輸出的runtime部署執行請註釋或刪除此項引數-->
<!-- Just for development, please remove this plugin.folders for production env -->
<property>
<name>plugin.folders</name>
<value>./src/plugin</value>
</property>
<!--基於gora的爬蟲資料底層儲存機制,-->
<!--官方文件及推薦為HBase,本專案預設配置為MongoDB。需要同步配置gora.properties檔案中相關引數。-->
<property>
<name>storage.data.store.class</name>
<value>org.apache.gora.mongodb.store.MongoStore</value>
<description>Default class for storing data</description>
</property>
<property>
<name>http.agent.name</name>
<value>Your Nutch Spider</value>
</property>
</configuration>
修改ivy/ivy.xml檔案 取消mongodb註釋
<!-- Uncomment this to use MongoDB as Gora backend. -->
<dependency org="org.apache.gora" name="gora-mongodb" rev="0.6.1" conf="*->default" />
修改conf/gora.properties檔案配置mongodb
############################
# MongoDBStore properties #
############################
gora.datastore.default=org.apache.gora.mongodb.store.MongoStore
gora.mongodb.override_hadoop_configuration=false
gora.mongodb.mapping.file=/gora-mongodb-mapping.xml
gora.mongodb.servers=localhost:27017
gora.mongodb.db=nutchFocuse
#gora.mongodb.login=login
#gora.mongodb.secret=secret
編譯專案 匯入intellij idea
在該目錄下依序執行ant clean、ant、ant eclipse。執行完成後,開啟intellij idea,選擇import Project->選擇apache-nutch-2.3.1目錄->import project from external model(選擇eclipse),之後一路next即可。
調整依賴順序
調整依賴順序:前三個依賴的順序應為conf、Module source、1.8(jdk)。
執行測試
在該目錄下建立資料夾urls,在資料夾下建立檔案seed.txt 該檔案用於儲存種子url。工程搭建完成後目錄結構如下圖所示:
我根據crawl指令碼“直譯”了一個java類(crawl)方便用於除錯
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.fetcher.FetcherJob;
import org.apache.nutch.indexer.IndexingJob;
import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParserJob;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Random;
// Logging goes through SLF4J (see the Logger / LoggerFactory imports above).
/**
 * Java translation of the Nutch {@code bin/crawl} script, meant for running and debugging a
 * complete crawl cycle (inject, then generate / fetch / parse / updatedb and optionally Solr
 * indexing for a number of rounds) from inside an IDE.
 *
 * <p>Usage: {@code crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>}
 */
public class Crawl extends Configured implements Tool {

  public static final Logger LOG = LoggerFactory.getLogger(Crawl.class);

  /** Arguments used by {@link #main(String[])} when no program arguments are supplied. */
  private static final String[] DEFAULT_ARGS = {"urls", "testcrawlid", "1"};

  /**
   * Performs complete crawling and, when a Solr URL is given, indexing for a set of root urls.
   *
   * <p>When started without program arguments the built-in debugging defaults are used
   * (seed directory {@code urls}, crawl id {@code testcrawlid}, one round, no Solr indexing).
   * To index into Solr as well, pass four arguments, for example
   * {@code urls testcrawlid http://localhost:8080/solr 1}.
   *
   * @param args {@code <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>}, or empty for defaults
   * @throws Exception if one of the underlying jobs throws
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    String[] parameter = (args == null || args.length == 0) ? DEFAULT_ARGS.clone() : args;
    int res = ToolRunner.run(conf, new Crawl(), parameter);
    System.exit(res);
  }

  /**
   * Runs the crawl cycle.
   *
   * @param args {@code <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>}
   * @return {@code 0} on success (including the case where generate reports that there is
   *     nothing left to fetch), {@code -1} when the arguments are invalid, otherwise the
   *     non-zero exit code of the first step that failed
   * @throws Exception if one of the underlying jobs throws
   */
  @Override
  public int run(String[] args) throws Exception {
    if (args.length < 3) {
      System.out.println("Usage: crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>");
      return -1;
    }
    if (args.length > 4) {
      // Message reads: "argument count does not match, check the input arguments".
      System.out.println("引數個數不匹配,檢查輸入引數");
      return -1;
    }

    String seedDir = args[0];
    String crawlId = args[1];
    // With four arguments the third one is the Solr URL; the round count is always last.
    String solrUrl = (args.length == 4) ? args[2] : "";
    String limit = args[args.length - 1];

    boolean valid = true;
    if (StringUtil.isEmpty(seedDir)) {
      System.out.println(
          "Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>");
      valid = false;
    }
    if (StringUtil.isEmpty(crawlId)) {
      System.out.println(
          "Missing crawlID : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>");
      valid = false;
    }
    if (StringUtil.isEmpty(solrUrl)) {
      // Not an error: indexing is simply skipped.
      System.out.println("No SOLRURL specified. Skipping indexing.");
    }
    if (StringUtil.isEmpty(limit)) {
      System.out.println(
          "Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>");
      valid = false;
    }
    if (!valid) {
      return -1;
    }

    // Parse the round count once, up front, so a bad value is reported before any job runs.
    int rounds;
    try {
      rounds = Integer.parseInt(limit);
    } catch (NumberFormatException e) {
      System.out.println("Invalid numberOfRounds '" + limit
          + "' : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>");
      return -1;
    }

    // MODIFY THE PARAMETERS BELOW TO YOUR NEEDS

    // number of slave nodes
    int numSlaves = 1;
    // total number of available tasks; becomes the Hadoop parameter "mapred.reduce.tasks"
    int numTasks = numSlaves << 1;
    // number of urls to fetch in one iteration
    int sizeFetchlist = 10;
    // time limit for fetching, in minutes
    String timeLimitFetch = "180";
    // Adds <days> to the current time to facilitate crawling urls already fetched
    // sooner than db.default.fetch.interval.
    int addDays = 0;

    getConf().set("mapred.reduce.tasks", String.valueOf(numTasks));
    getConf().set("mapred.child.java.opts", "-Xmx1000m");
    getConf().set("mapred.reduce.tasks.speculative.execution", "false");
    getConf().set("mapred.map.tasks.speculative.execution", "false");
    getConf().set("mapred.compress.map.output", "true");

    InjectorJob injector = new InjectorJob(getConf());
    GeneratorJob generator = new GeneratorJob(getConf());
    FetcherJob fetcher = new FetcherJob(getConf());
    ParserJob parse = new ParserJob(getConf());
    DbUpdaterJob dbUpdaterJob = new DbUpdaterJob(getConf());
    IndexingJob indexingJob = new IndexingJob();
    SolrDeleteDuplicates solrDeleteDuplicates = new SolrDeleteDuplicates();

    // initialize crawlDb
    getConf().set(Nutch.CRAWL_ID_KEY, crawlId);

    int res;

    System.out.println("initial injection");
    res = runStep(injector, "inject", seedDir, "-crawlId", crawlId);
    if (res != 0) {
      return res;
    }

    // One generator for all rounds; only used to make batch ids unique.
    Random random = new Random();

    for (int round = 0; round < rounds; round++) {
      System.out.println("Begin Generate");
      String batchId = System.currentTimeMillis() + "-" + random.nextInt(32767);

      // generate a new batch
      res = runStep(generator, "generate",
          "-topN", String.valueOf(sizeFetchlist),
          "-noNorm",
          "-noFilter",
          "-adddays", String.valueOf(addDays),
          "-crawlId", crawlId,
          "-batchId", batchId);
      if (res == 1) {
        // Exit code 1 is reported by print() as "no more URLs to fetch": stop iterating,
        // but this is a normal end of the crawl, not a failure.
        break;
      }
      if (res != 0) {
        return res;
      }

      System.out.println("Begin Fetch");
      getConf().set("fetcher.timelimit.mins", timeLimitFetch);
      // "-threads 10": number of fetcher threads
      res = runStep(fetcher, "fetch", batchId, "-crawlId", crawlId, "-threads", "10");
      if (res != 0) {
        return res;
      }

      // NOTE(review): the original author remarks that the configuration already parses
      // during fetch, which would make this separate parse step redundant; it is still
      // executed here, as before. Confirm against the active configuration.
      System.out.println("parse begin");
      getConf().set("mapred.skip.attempts.to.start.skipping", "2");
      getConf().set("mapred.skip.map.max.skip.records", "1");
      res = ToolRunner.run(getConf(), parse, new String[] {batchId, "-crawlId", crawlId});
      if (res == 0) {
        System.out.println("parse finish");
      } else {
        System.out.println("parse failed");
        return res;
      }

      // updatedb with this batch
      System.out.println("begin updatedb");
      res = runStep(dbUpdaterJob, "updatedb", batchId, "-crawlId", crawlId);
      if (res != 0) {
        return res;
      }

      if (StringUtil.isEmpty(solrUrl)) {
        System.out.println("Skipping indexing tasks: no SOLR url provided.");
      } else {
        System.out.println("begin Indexing");
        getConf().set("solr.server.url", solrUrl);
        res = runStep(indexingJob, "indexing", "-all", "-crawlId", crawlId);
        if (res != 0) {
          return res;
        }

        System.out.println("begin SOLR dedup");
        res = runStep(solrDeleteDuplicates, "solr Delete Duplicates", solrUrl);
        if (res != 0) {
          return res;
        }
      }
    }
    return 0;
  }

  /**
   * Runs one tool with this instance's configuration and reports its outcome via
   * {@link #print(int, String)}.
   *
   * @param tool the job to run
   * @param name step name used in the console message
   * @param stepArgs command-line style arguments for the job
   * @return the exit code returned by the job
   */
  private int runStep(Tool tool, String name, String... stepArgs) throws Exception {
    int res = ToolRunner.run(getConf(), tool, stepArgs);
    print(res, name);
    return res;
  }

  /**
   * Prints a one-line outcome message for a finished step.
   *
   * @param res exit code of the step: {@code 0} is reported as finished, {@code 1} as
   *     finished with nothing more to fetch, anything else as failed
   * @param name step name to prefix the message with
   */
  public static void print(int res, String name) {
    if (res == 0) {
      System.out.println(name + " finish");
    } else if (res == 1) {
      System.out.println(name + " finish but no more URLs to fetch now,Escaping loop");
    } else {
      System.out.println(name + " failed");
    }
  }
}
先啟動mongodb,然後直接執行crawl類即可。我的配置預設mongodb是配置在本地機器。
如果要單獨執行nutch的每個階段,如inject、generate、fetch等可以按下面的方法來配置。以inject為例,其他都類似。
在idea裡面點選Edit Configurations...,然後點選左上角+號,選擇Application,配置執行的類和引數即可,如下圖所示: