The previous section covered how to integrate Spark with Hadoop; this one shows how to integrate Spark with HBase.
1. Get the HBase classpath
# Remove the netty and jetty jars from this classpath first, otherwise they will conflict with Spark's own jars
HBASE_PATH=`/home/hadoop/Deploy/hbase-1.1.2/bin/hbase classpath`
2. Start Spark
bin/spark-shell --driver-class-path $HBASE_PATH
3. Run a few simple operations
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
val conf = HBaseConfiguration.create()
conf.set(TableInputFormat.INPUT_TABLE, "inpatient_hb")
val admin = new HBaseAdmin(conf)
admin.isTableAvailable("inpatient_hb")
res1: Boolean = true
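(Optional) If only part of the table is needed, the scan can be narrowed before building the RDD in the next step. A minimal sketch, assuming a hypothetical column family named "info" (substitute the actual family and columns of inpatient_hb):

// Restrict the scan to one column family, or to specific "family:qualifier" columns
conf.set(TableInputFormat.SCAN_COLUMN_FAMILY, "info")
// conf.set(TableInputFormat.SCAN_COLUMNS, "info:name info:age")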
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], classOf[org.apache.hadoop.hbase.client.Result])
hBaseRDD.count()
2017-01-03 20:46:29,854 INFO [main] scheduler.DAGScheduler (Logging.scala:logInfo(58)) - Job 0 finished: count at <console>:36, took 23.170739 s
res2: Long = 115077
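Each element of hBaseRDD is an (ImmutableBytesWritable, Result) pair, so you can do more than count rows. A rough sketch of pulling cell values out, again assuming a hypothetical "info:name" column in inpatient_hb:

import org.apache.hadoop.hbase.util.Bytes

val rows = hBaseRDD.map { case (key, result) =>
  // The key wraps the raw row key bytes
  val rowKey = Bytes.toString(key.get())
  // getValue may return null if the cell is absent, so guard before converting
  val name = Option(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name")))
               .map(Bytes.toString)
               .getOrElse("")
  (rowKey, name)
}
rows.take(5).foreach(println)

Converting the Result objects to plain Scala values inside the closure also sidesteps the NotSerializableException commonly seen when raw Result objects are collected back to the driver.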