文档章节

Nutch2.3 bin/crawl、bin/nutch 脚本

VictorHu
 VictorHu
发布于 2015/11/06 12:49
字数 1707
阅读 137
收藏 0
点赞 0
评论 0

环境


Nutch版本:Nutch 2.3

内容


1.bin/crawl脚本

#!/bin/bash
# The Crawl command script : crawl <seedDir> <crawlId> <solrURL> <numberOfRounds>
#
# 下面这一段主要是判断bin/crawl命令的参数
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND 
# INDEXING FOR EACH BATCH

# Positional arguments: <seedDir> <crawlID> [<solrURL>] <numberOfRounds>.
# The Solr URL is optional, so the number of rounds is either the third
# or the fourth argument depending on how many arguments were given.
SEEDDIR="$1"
CRAWL_ID="$2"
case "$#" in
  3)
    LIMIT="$3"
    ;;
  4)
    SOLRURL="$3"
    LIMIT="$4"
    ;;
  *)
    echo "Unknown # of arguments $#"
    echo "Usage: crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>"
    exit -1
    ;;
esac

# An argument that was passed as an empty string is treated as missing.
if [ -z "$SEEDDIR" ]; then
  echo "Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
  exit -1
fi

if [ -z "$CRAWL_ID" ]; then
  echo "Missing crawlID : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
  exit -1
fi

# Without a Solr URL the crawl still runs; only the indexing steps are skipped.
if [ -z "$SOLRURL" ]; then
  echo "No SOLRURL specified. Skipping indexing."
fi

if [ -z "$LIMIT" ]; then
  echo "Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
  exit -1
fi

# The settings in this section can be adjusted to suit the actual environment.
#
#############################################
# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
#############################################

# Number of slave nodes in the Hadoop cluster.
numSlaves=1

# Total number of available tasks (number of distributed compute tasks);
# sets Hadoop parameter "mapred.reduce.tasks".
numTasks=$(( numSlaves * 2 ))

# Number of URLs to fetch in one iteration
# (upstream note: 250K per task?).
sizeFetchlist=$(( numSlaves * 50000 ))

# Time limit for fetching; passed to the fetch step as fetcher.timelimit.mins.
timeLimitFetch=180

# Adds <days> to the current time to facilitate crawling URLs already
# fetched sooner than db.default.fetch.interval
# (i.e. the crawl interval offset, expressed in days).
addDays=0
#############################################

# Absolute path of the directory that contains this script.
bin="$(dirname "$0")"
bin="$(cd "$bin"; pwd)"

# Determine the run mode from the presence of a job file next to bin/:
# a *nutch*.job file in the parent directory means distributed mode,
# otherwise the crawl runs locally.
# The glob is iterated instead of being handed to a single `[ -f ... ]`,
# because `[` fails with "too many arguments" as soon as more than one
# job file matches, which used to leave the mode at "local" by mistake.
mode=local
for job_file in "${bin}"/../*nutch*.job; do
  if [ -f "$job_file" ]; then
    mode=distributed
    break
  fi
done

# Hadoop options shared by the generate/fetch/parse/updatedb/index steps.
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapred.reduce.tasks=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"

# Check that hadoop can be found on the path (only needed in distributed mode).
# `command -v` is a shell builtin, so the check neither requires an external
# `which` program nor depends on what that program prints when nothing is found.
if [ "$mode" = "distributed" ]; then
  if ! command -v hadoop > /dev/null 2>&1; then
    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
    exit -1
  fi
fi


# Run "$bin/nutch" with the given arguments and abort the crawl on failure.
# Globals:   bin (read), RETCODE (written: exit status of the nutch call)
# Arguments: passed through unchanged to $bin/nutch
# Outputs:   the command line before running it; an error report on failure
# Returns:   0 on success; on a non-zero status the whole script exits
#            with that status.
__bin_nutch() {
  # echo command and arguments
  printf '%s\n' "$bin/nutch $*"

  "$bin/nutch" "$@"
  RETCODE=$?

  # Success: nothing more to do.
  if [ "$RETCODE" -eq 0 ]; then
    return 0
  fi

  printf '%s\n' \
    "Error running:" \
    "  $bin/nutch $*" \
    "Failed with exit value $RETCODE."
  exit "$RETCODE"
}



# initial injection
echo "Injecting seed URLs"
__bin_nutch inject "$SEEDDIR" -crawlId "$CRAWL_ID"

# main loop : rounds of generate - fetch - parse - update
# The number of rounds is given by the LIMIT argument.
for ((a=1; a <= LIMIT ; a++))
do
  # A file named ".STOP" in the current directory ends the crawl early.
  if [ -e ".STOP" ]
  then
   echo "STOP file found - escaping loop"
   break
  fi

  echo $(date) ": Iteration $a of $LIMIT"

  echo "Generating batchId"
  batchId=$(date +%s)-$RANDOM

  # Generate step
  echo "Generating a new fetchlist"
  # $commonOptions is left unquoted on purpose: it holds several
  # space-separated "-D key=value" options that must become separate words.
  generate_args=($commonOptions -topN "$sizeFetchlist" -noNorm -noFilter -adddays "$addDays" -crawlId "$CRAWL_ID" -batchId "$batchId")
  echo "$bin/nutch generate ${generate_args[*]}"
  # generate is not run through __bin_nutch because exit status 1 is not an
  # error here: it only ends the loop. The script path is quoted (as it is in
  # __bin_nutch) so that an installation path containing spaces still works.
  "$bin/nutch" generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
      : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"
    echo "Escaping loop: no more URLs to fetch now"
    break
  else
    echo "Error running:"
    echo "  $bin/nutch generate ${generate_args[*]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi

  # Fetch step
  echo "Fetching : "
  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins="$timeLimitFetch" "$batchId" -crawlId "$CRAWL_ID" -threads 50

  # Parse step
  # parsing the batch
  echo "Parsing : "
  # enable the skipping of records for the parsing so that a dodgy document
  # does not fail the full task
  skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
  __bin_nutch parse $commonOptions $skipRecordsOptions "$batchId" -crawlId "$CRAWL_ID"

  # updatedb step
  # updatedb with this batch
  echo "CrawlDB update for $CRAWL_ID"
  __bin_nutch updatedb $commonOptions "$batchId" -crawlId "$CRAWL_ID"

  if [ -n "$SOLRURL" ]; then
    # index step
    echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
    __bin_nutch index $commonOptions -D solr.server.url="$SOLRURL" -all -crawlId "$CRAWL_ID"

    # solrdedup step
    echo "SOLR dedup -> $SOLRURL"
    __bin_nutch solrdedup $commonOptions "$SOLRURL"
  else
      echo "Skipping indexing tasks: no SOLR url provided."
  fi

done

exit 0

2.bin/nutch脚本

    #!/bin/bash

#
# The Nutch command script
#
# Environment Variables 环境变量
#
#   NUTCH_JAVA_HOME The java implementation to use.  Overrides JAVA_HOME.
#
#   NUTCH_HEAPSIZE  The maximum amount of heap to use, in MB. 
#                   Default is 1000.
#
#   NUTCH_OPTS      Extra Java runtime options.
#                   Multiple options must be separated by white space.
#
#   NUTCH_LOG_DIR   Log directory (default: $NUTCH_HOME/logs)
#
#   NUTCH_LOGFILE   Log file (default: hadoop.log)
#
#   NUTCH_CONF_DIR  Path(s) to configuration files (default: $NUTCH_HOME/conf).
#                   Multiple paths must be separated by a colon ':'.
# 
# Cygwin is a Linux-like runtime environment on Windows; it is recognised
# by a `uname` output that starts with "CYGWIN".
if [[ "$(uname)" == CYGWIN* ]]; then
  cygwin=true
else
  cygwin=false
fi

#######################################
# Follow a chain of symbolic links and print the final path.
# Arguments: $1 - path that may be a symlink
# Outputs:   the resolved path on stdout ($1 itself when it is not a link)
# A link target is used as-is only when it is absolute. Any relative target,
# including one that contains a slash such as "../real/bin/nutch", is
# interpreted relative to the directory holding the link; resolving it
# against the current working directory would give the wrong location.
#######################################
resolve_symlink() {
  local path="$1"
  local listing target
  while [ -h "$path" ]; do
    listing=$(ls -ld -- "$path")
    # the link target is whatever follows "-> " in the long listing
    target=$(expr "$listing" : '.*-> \(.*\)$')
    case "$target" in
      /*) path="$target" ;;
      *)  path="$(dirname "$path")/$target" ;;
    esac
  done
  printf '%s\n' "$path"
}

# resolve links - $0 may be a softlink
THIS="$(resolve_symlink "$0")"

# Usage information for the nutch command
# if no args specified, show usage
if [ "$#" -eq 0 ]; then
  cat <<'EOF'
Usage: nutch COMMAND
where COMMAND is one of:
 inject		inject new urls into the database
 hostinject     creates or updates an existing host table from a text file
 generate 	generate new batches to fetch from crawl db
 fetch 		fetch URLs marked during generate
 parse 		parse URLs marked during fetch
 updatedb 	update web table after parsing
 updatehostdb   update host table after parsing
 readdb 	read/dump records from page database
 readhostdb     display entries from the hostDB
 index          run the plugin-based indexer on parsed batches
 elasticindex   run the elasticsearch indexer - DEPRECATED use the index command instead
 solrindex 	run the solr indexer on parsed batches - DEPRECATED use the index command instead
 solrdedup 	remove duplicates from solr
 solrclean      remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead
 clean          remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins
 parsechecker   check the parser for a given url
 indexchecker   check the indexing filters for a given url
 plugin 	load a plugin and run one of its classes main()
 nutchserver    run a (local) Nutch server on a user defined port
 webapp         run a local Nutch web application
 junit         	runs the given JUnit test
 or
 CLASSNAME 	run the class named CLASSNAME
Most commands print help when invoked w/o parameters.
EOF
  exit 1
fi

# get arguments: the first word names the command; the remaining
# arguments stay in "$@" for that command
COMMAND="$1"
shift

# some directories
# NUTCH_HOME is set to the parent of the directory that holds this script.
THIS_DIR="$(dirname "$THIS")"
NUTCH_HOME="$(cd "$THIS_DIR/.." ; pwd)"

# some Java parameters
# NUTCH_JAVA_HOME, when set, overrides JAVA_HOME.
if [ -n "$NUTCH_JAVA_HOME" ]; then
  #echo "run java in $NUTCH_JAVA_HOME"
  JAVA_HOME="$NUTCH_JAVA_HOME"
fi

# Neither variable provided a Java location: stop here.
if [ -z "$JAVA_HOME" ]; then
  echo "Error: JAVA_HOME is not set."
  exit 1
fi


# NUTCH_JOB
#######################################
# Print the last regular file matching *nutch*.job directly under a directory.
# Arguments: $1 - directory to look in
# Outputs:   the path of the job file on stdout, nothing when there is none
# Returns:   0 when a job file was found, non-zero otherwise
# The glob is iterated rather than passed to a single `[ -f ... ]`, because
# `[` fails with "too many arguments" when several job files match.
#######################################
find_nutch_job() {
  local candidate found=""
  for candidate in "$1"/*nutch*.job; do
    if [ -f "$candidate" ]; then
      found="$candidate"
    fi
  done
  [ -n "$found" ] && printf '%s\n' "$found"
}

# A job file in NUTCH_HOME selects distributed mode (local=false);
# without one the command runs locally.
if job_file="$(find_nutch_job "$NUTCH_HOME")"; then
  local=false
  NUTCH_JOB="$job_file"
  # cygwin path translation
  if $cygwin; then
    NUTCH_JOB="$(cygpath -p -w "$NUTCH_JOB")"
  fi
else
  local=true
fi

JAVA="$JAVA_HOME/bin/java"

# check envvars which might override default args
# Maximum heap is 1000 MB unless NUTCH_HEAPSIZE (a number of MB) is set.
JAVA_HEAP_MAX="-Xmx${NUTCH_HEAPSIZE:-1000}m"

# Build the CLASSPATH, starting from the configuration directory.
# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
: "${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
CLASSPATH="${NUTCH_CONF_DIR}:$JAVA_HOME/lib/tools.jar"

# so that filenames w/ spaces are handled correctly in loops below
IFS=

# add libs to CLASSPATH
if $local; then
  for jar in "$NUTCH_HOME"/lib/*.jar; do
    CLASSPATH+=":$jar"
  done
  # local runtime
  # add plugins to classpath: when a plugins directory exists, NUTCH_HOME
  # itself is put at the front of the CLASSPATH
  if [ -d "$NUTCH_HOME/plugins" ]; then
    CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
  fi
fi

# cygwin path translation
if $cygwin; then
  CLASSPATH="$(cygpath -p -w "$CLASSPATH")"
fi

# setup 'java.library.path' for native-hadoop code if necessary
# used only in local mode
JAVA_LIBRARY_PATH=''
if [ -d "${NUTCH_HOME}/lib/native" ]; then
  # Ask Hadoop's PlatformName class for the platform string; spaces are
  # replaced by underscores to form the sub-directory name.
  JAVA_PLATFORM=$("${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g')

  # JAVA_LIBRARY_PATH is always empty at this point (it was reset just above),
  # so it is assigned directly rather than appended to.
  JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
fi

# cygwin path translation, only when there is a path to translate
if [ "$cygwin" = true ] && [ -n "$JAVA_LIBRARY_PATH" ]; then
  JAVA_LIBRARY_PATH="$(cygpath -p -w "$JAVA_LIBRARY_PATH")"
fi

# restore ordinary behaviour
unset IFS

# default log directory & file
: "${NUTCH_LOG_DIR:=$NUTCH_HOME/logs}"
: "${NUTCH_LOGFILE:=hadoop.log}"

#Fix log path under cygwin
if $cygwin; then
  NUTCH_LOG_DIR="$(cygpath -p -w "$NUTCH_LOG_DIR")"
fi

# Turn NUTCH_OPTS into an array and append the logging properties.
# $NUTCH_OPTS is left unquoted on purpose: the variable may carry several
# whitespace-separated options, each of which must become its own element.
NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR" -Dhadoop.log.file="$NUTCH_LOGFILE")

if [ -n "$JAVA_LIBRARY_PATH" ]; then
  NUTCH_OPTS+=(-Djava.library.path="$JAVA_LIBRARY_PATH")
fi

# Map the requested command to the class that implements it
# figure out which class to run
case "$COMMAND" in
  crawl)
    echo "Command $COMMAND is deprecated, please use bin/crawl instead"
    exit -1
    ;;
  inject)
    CLASS=org.apache.nutch.crawl.InjectorJob
    ;;
  hostinject)
    CLASS=org.apache.nutch.host.HostInjectorJob
    ;;
  generate)
    CLASS=org.apache.nutch.crawl.GeneratorJob
    ;;
  fetch)
    CLASS=org.apache.nutch.fetcher.FetcherJob
    ;;
  parse)
    CLASS=org.apache.nutch.parse.ParserJob
    ;;
  updatedb)
    CLASS=org.apache.nutch.crawl.DbUpdaterJob
    ;;
  updatehostdb)
    CLASS=org.apache.nutch.host.HostDbUpdateJob
    ;;
  readdb)
    CLASS=org.apache.nutch.crawl.WebTableReader
    ;;
  readhostdb)
    CLASS=org.apache.nutch.host.HostDbReader
    ;;
  elasticindex)
    CLASS=org.apache.nutch.indexer.elastic.ElasticIndexerJob
    ;;
  solrindex)
    # The first remaining argument is used as the Solr URL and is consumed here.
    CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
    shift
    ;;
  index)
    CLASS=org.apache.nutch.indexer.IndexingJob
    ;;
  solrdedup)
    CLASS=org.apache.nutch.indexer.solr.SolrDeleteDuplicates
    ;;
  solrclean)
    # The first two remaining arguments are folded into CLASS (the second one
    # as the Solr URL) and both are consumed here.
    CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
    shift; shift
    ;;
  clean)
    CLASS=org.apache.nutch.indexer.CleaningJob
    ;;
  parsechecker)
    CLASS=org.apache.nutch.parse.ParserChecker
    ;;
  indexchecker)
    CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
    ;;
  plugin)
    CLASS=org.apache.nutch.plugin.PluginRepository
    ;;
  webapp)
    CLASS=org.apache.nutch.webui.NutchUiServer
    ;;
  nutchserver)
    CLASS=org.apache.nutch.api.NutchServer
    ;;
  junit)
    CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
    CLASS=org.junit.runner.JUnitCore
    ;;
  *)
    # Anything else is taken to be a class name and run directly.
    CLASS=$COMMAND
    ;;
esac

# Choose between local and distributed execution.
if $local; then
 # fix for the external Xerces lib issue with SAXParserFactory
 NUTCH_OPTS=(-Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl "${NUTCH_OPTS[@]}")
 EXEC_CALL=("$JAVA" "$JAVA_HEAP_MAX" "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH")
else
 # check that hadoop can be found on the path
 # `command -v` is a shell builtin, so the check neither requires an external
 # `which` program nor depends on what that program prints when nothing is found.
 if ! command -v hadoop > /dev/null 2>&1; then
    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
    exit -1
 fi
 # distributed mode
 EXEC_CALL=(hadoop jar "$NUTCH_JOB")
fi

# Run the command with the parameters prepared above.
# run it
# $CLASS stays unquoted on purpose: for solrindex/solrclean it holds the class
# name followed by extra "-D ..." words that must be passed as separate arguments.
exec "${EXEC_CALL[@]}" $CLASS "$@"

© 著作权归作者所有

共有 人打赏支持
VictorHu
粉丝 1
博文 12
码字总数 14462
作品 0
闵行
将nutch2.3的bin/crawl脚本改写为java类

将nutch2.3的bin/crawl脚本改写为java类 标签: nutch [TOC] nutch1.8以后,以前的主控代码类没了,只剩下对应的控制脚本,感觉在IDEA里面调试不方便,所以我了解了下shell脚本,根据nutch2....

brianway
2016/01/19
1K
0
Nutch2.3 + hbase0.98.8 +hadoop2.5.2

@杨尚川 你好,想跟你请教个问题:杨老师 实在不好意思,想麻烦你一个问题,困扰了我很久。 我并不是一个只会伸手要资源的傻瓜,在学习了你相关的nutch视频之后,试着自己去尝试做一些东西。...

腰间两把刀
2015/05/24
4.2K
9
nutch2.3爬虫抓取电影网站

上一篇文章介绍了nutch的安装 该文会简单的抓取网站 http://www.6vhao.com 1,打开目录nutch-2.3/runtime/local 2,mkdir urls nano urls/url:添加链接 http://www.6vhao.com保存退出 3,在l...

Kadima
2015/10/29
0
0
nutch 安装部署 以nutch2.3.1 为例

最近在研究nutch 在Ubuntu系统上安装部署nutch,nutch安装步骤如下:1、先 安装ANT (ant下载安装包就不写了 百度一大把)//解压 antsudo tar -zxvf /usr/test/soft/apache-ant-1.9.7-bin.tar....

逝去的过去
2016/07/18
134
0
《Nutch笔记》Nutch-1.7+solr-4.7集成

一、下载安装nutch 下载地址 http://apache.fayea.com/apache-mirror/nutch/1.7/apache-nutch-1.7-bin.tar.gz 安装 [root@centos data]# cd /data/[root@centos data]# mkdir nutch[root@ce......

燃點
2014/03/21
0
0
nutch搜索引擎的搭建以及配置

最近公司需要搭建一个搜索引擎,于是就发现了apache旗下的这个nutch,也看了不少的文章,就在本地搭建了一个进行测试,发现局域网抓取还是比较好的,但是在互联网抓取还是有点问题,像百度、谷歌这...

dh_
2014/04/06
0
0
nutch搜索引擎的搭建以及配置

最近公司需要搭建一个搜索引擎,于是就发现了apache旗下的这个nutch,也看了不少的文章,就在本地搭建了一个进行测试,发现局域网抓取还是比较好的,但是在互联网抓取还是有点问题,像百度、谷歌这...

wiliiwin
2010/08/12
0
0
nutch2 crawl 命令分解,抓取网页的详细过程

首先,何以见得crawl是inject,generate,fetch,parse,update的集成呢(命令的具体含义及功能会在后续文章中说明),我们打开NUTCH_HOME/runtime/local/bin/crawl 我将主要代码黏贴下来 # initia...

Kadima
2015/10/30
0
0
nutch2.3.1、solr4.10.4、hadoop2.5.2、hbase0.98.19集成

nutch 1.nutch的相关介绍 1.1 什么是nutch? Nutch 是一个开源Java实现的搜索引擎。它提供了我们运行自己的搜索引擎所需的全部工具。包括全文搜索和Web爬虫。Nutch为我们提供了这样一个不同的...

刷新承诺
2016/05/27
477
0
nutch 抓取的内容如何存 hbase中

用的是nutch2.0+ mysql+solr能够做数据的搜集 urls/下存的是记录抓取网页信息 bin/nutch crawl urls -depth 3 -topN 5 可以把抓取的内容存储数据库。 bin/nutch solrindex http://127.0.0.1...

tngou
2012/12/11
1K
0

没有更多内容

加载失败,请刷新页面

加载更多

下一页

TensorFlow 作用域与操作符的受限范围

variable_scope 影响变量和操作符 name_scope 只影响操作符 with tf.name_scope(""),使用空字符串将作用域返回到顶层 tf.variable_scope("") 相当于添加一个空层 import tensorflow as tf...

阿豪boy
6分钟前
0
0
Java面试基础篇——第六篇:常见Map类的区别

常见的map类有: HashMap, ConcurrentHashMap (Jdk1.8) , LinkedHashMap, TreeMap, Hashtable。 其中我们最常用的莫过于HashMap, 和并发情况下使用的ConcurrentHashMap了,它们的主要区别就在...

developlee的潇洒人生
7分钟前
0
0
崛起于Springboot2.X之前端模版freemaker(23)

1、配置文件 spring: freemarker: allow-request-override: false cache: true check-template-location: true charset: UTF-8 content-type: text/html ......

木九天
24分钟前
1
0
spring-boot:run启动时,指定spring.profiles.active

Maven启动指定Profile通过-P,如mvn spring-boot:run -Ptest,但这是Maven的Profile。 如果要指定spring-boot的spring.profiles.active,则必须使用mvn spring-boot:run -Drun.profiles=test......

夜黑人模糊灬
26分钟前
0
0
大数据分析挖掘技术学习:Python文本分类

引言 文本分类作为自然语言处理任务之一,被广泛应用于解决各种商业领域的问题。文本分类的目的是将 文本/文档 自动地归类为一种或多种预定义的类别。常见的文本分类应用如下: • 理解社交媒...

加米谷大数据
30分钟前
0
0
istio-0.8 指标监控,prometheus,grafana

配置: https://istio.io/docs/tasks/telemetry/metrics-logs/ https://istio.io/docs/tasks/telemetry/tcp-metrics/ envoy拦截请求>上报mixer>对接prometheus>grafana 效果截图: promethe......

xiaomin0322
32分钟前
0
0
公众号推荐

阿里技术 书籍:《不止代码》

courtzjl
35分钟前
0
0
关于改进工作效率

1.给不同的业务线建立需求群,所有的数据需求都在群里面提。 2.对于特别难搞定的事情,到对应的技术哪去做,有问题随时沟通。 3.定期给工作总结形成方法论。 4.学习新的技术,尝试用新的方法...

Avner
42分钟前
0
0
关于thinkphp 框架开启路径重写,无法获取Authorization Header

今天遇到在thinkphp框架中获取不到header头里边的 Authorization ,后来在.htaccess里面加多一项解决,记录下: <IfModule mod_rewrite.c> Options +FollowSymlinks -Multiviews Rewrite......

殘留回憶
46分钟前
0
0
centos 使用yum安装nginx后如何添加模块 10

centos 使用yum安装nginx后如何添加模块 10 centos6.2版本,使用yum来安装了nginx,但是最近需要重新添加模块,所以就傻了,询问下有人知道怎么重新添加模块吗? PS:俺是新手,需要高手救助...

linjin200
49分钟前
1
0

没有更多内容

加载失败,请刷新页面

加载更多

下一页

返回顶部
顶部