Spark读写HBase

Spark读写HBase示例

1、HBase shell查看表结构

hbase(main)::> desc 'SDAS_Person'

Table SDAS_Person is ENABLED

SDAS_Person

COLUMN FAMILIES DESCRIPTION

{NAME => 'cf0', BLOOMFILTER => 'ROW', VERSIONS => '', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE',

 DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '', BLOCKCACHE =>

 'true', BLOCKSIZE => '', REPLICATION_SCOPE => ''}

{NAME => 'cf1', BLOOMFILTER => 'ROW', VERSIONS => '', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE',

 DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '', BLOCKCACHE =>

 'true', BLOCKSIZE => '', REPLICATION_SCOPE => ''}

{NAME => 'cf2', BLOOMFILTER => 'ROW', VERSIONS => '', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE',

 DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '', BLOCKCACHE =>

 'true', BLOCKSIZE => '', REPLICATION_SCOPE => ''}

 row(s) in 0.0810 seconds

hbase(main)::> desc 'RESULT'

Table RESULT is ENABLED

RESULT

COLUMN FAMILIES DESCRIPTION

{NAME => 'cf0', BLOOMFILTER => 'ROW', VERSIONS => '', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE',

DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '', BLOCKCACHE =>

'true', BLOCKSIZE => '', REPLICATION_SCOPE => ''}

 row(s) in 0.0250 seconds

2、HBase shell查看已插入的数据

hbase(main)::> scan 'SDAS_Person'

ROW                         COLUMN+CELL

 SDAS_1#                   column=cf0:Age, timestamp=, value=

 SDAS_1#                   column=cf0:CompanyID, timestamp=, value=

 SDAS_1#                   column=cf0:InDate, timestamp=, value=-- ::08.49

 SDAS_1#                   column=cf0:Money, timestamp=, value=5.20

 SDAS_1#                   column=cf0:Name, timestamp=, value=zhangsan

 SDAS_1#                   column=cf0:PersonID, timestamp=, value=

3、pom.xml：

    <dependency>

      <groupId>org.scala-lang</groupId>

      <artifactId>scala-library</artifactId>

      <version>${scala.version}</version>

    </dependency>

    <dependency>

      <groupId>org.apache.spark</groupId>

      <artifactId>spark-core_${scala.binary.version}</artifactId>

      <version>${spark.version}</version>

      <scope>provided</scope>

    </dependency>

4、源码：

package com.zxth.sdas.spark.apps

import org.apache.spark._

import org.apache.spark.rdd.NewHadoopRDD

import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}

import org.apache.hadoop.hbase.client.HBaseAdmin

import org.apache.hadoop.hbase.mapreduce.TableInputFormat

import org.apache.hadoop.hbase.util.Bytes

import org.apache.hadoop.hbase.client.Put

import org.apache.hadoop.hbase.io.ImmutableBytesWritable

import org.apache.hadoop.mapreduce.Job

import org.apache.hadoop.hbase.client.Result

import org.apache.hadoop.hbase.mapreduce.TableOutputFormat

/**
 * Demo Spark job that reads the HBase table `SDAS_Person`, sums the `cf0:Age`
 * column over all rows, and writes the total and the average to row "1" of
 * the HBase table `RESULT` (columns `cf0:total` and `cf0:average`).
 */
object HBaseOp {

  /**
   * Sum of all ages computed by the most recent run of `main`.
   * Assigned once on the driver; never mutated inside an RDD closure.
   */
  var total: Int = 0

  /**
   * Entry point of the job.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HBaseOp")
    // Fall back to local mode only when no master was supplied (e.g. via
    // `spark-submit --master`), instead of unconditionally overriding it.
    sparkConf.setIfMissing("spark.master", "local")

    val sc = new SparkContext(sparkConf)
    try {
      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", "master,slave1,slave2")
      conf.set("hbase.zookeeper.property.clientPort", "2181")
      conf.set(TableInputFormat.INPUT_TABLE, "SDAS_Person")

      // Read the table as an RDD of (row key, Result) pairs.
      val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[ImmutableBytesWritable],
        classOf[Result])

      // Bytes.toBytes always encodes as UTF-8; String.getBytes would depend
      // on the platform default charset.
      val family = Bytes.toBytes("cf0")
      val nameQualifier = Bytes.toBytes("Name")
      val ageQualifier = Bytes.toBytes("Age")

      // Extract plain (rowKey, name, age) tuples first and cache those rather
      // than the raw Hadoop records, because the RDD is traversed by several
      // actions below.
      val people = hBaseRDD.map { case (_, result) =>
        val key = Bytes.toString(result.getRow)
        val name = Option(result.getValue(family, nameQualifier))
          .map(bytes => Bytes.toString(bytes))
          .getOrElse("")
        // A missing, empty or non-numeric age counts as 0 instead of
        // aborting the whole job.
        val age = Option(result.getValue(family, ageQualifier))
          .flatMap(bytes => scala.util.Try(Bytes.toString(bytes).trim.toInt).toOption)
          .getOrElse(0)
        (key, name, age)
      }.cache()

      val count = people.count()
      println("\n\n\n:" + count)

      people.foreach { case (key, name, age) =>
        println("Row key:" + key + " Name:" + name + " Age:" + age)
      }

      // Aggregate through an RDD action so the result is returned to the
      // driver. Mutating a driver-side variable inside `foreach` only updates
      // the executors' copies and yields 0 on a real cluster.
      val sum = people.map(_._3).fold(0)(_ + _)
      total = sum

      // Avoid NaN when the table is empty.
      val average: Double = if (count == 0L) 0.0 else sum.toDouble / count.toDouble
      println("" + sum + "/" + count + " average age:" + average.toString())

      // Write the result back to HBase.
      conf.set(TableOutputFormat.OUTPUT_TABLE, "RESULT")
      val job = Job.getInstance(conf)
      job.setOutputKeyClass(classOf[ImmutableBytesWritable])
      job.setOutputValueClass(classOf[Put])
      job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

      // Single output row: (row key, total, average), all stored as strings.
      val resultRDD = sc.makeRDD(Seq(("1", sum.toString, average.toString)))
      val saveRDD = resultRDD.map { case (rowKey, totalText, averageText) =>
        val put = new Put(Bytes.toBytes(rowKey))
        put.addColumn(Bytes.toBytes("cf0"), Bytes.toBytes("total"), Bytes.toBytes(totalText))
        put.addColumn(Bytes.toBytes("cf0"), Bytes.toBytes("average"), Bytes.toBytes(averageText))
        (new ImmutableBytesWritable, put)
      }

      println("getConfiguration")
      val outputConf = job.getConfiguration()
      println("save")
      saveRDD.saveAsNewAPIHadoopDataset(outputConf)
    } finally {
      // Always release the SparkContext, even if reading or writing failed.
      sc.stop()
    }
  }
}

5、maven打包

mvn clean scala:compile compile package

6、提交运算

bin/spark-submit \

--jars $(echo /opt/hbase-1.2./lib/*.jar | tr ' ' ',') \

--class com.zxth.sdas.spark.apps.HBaseOp \

--master local \

sdas-spark-1.0.0.jar

相关文章