Hive学习之Transform/Map-Reduce语法

时间:2020-12-19 18:23:48

        使用Hive的用户可以在数据流中插入自定义的Mappers和Reducers,为了运行自定义的Mapper和Reducer脚本,用户可以使用下面的命令,该命令使用TRANSFORM从句嵌入Mapper和Reducer脚本:

query:
FROM(
FROM src
MAPexpression (',' expression)*
(inRowFormat)?
USING 'my_map_script'
(AS colName (',' colName)* )?
(outRowFormat)?(outRecordReader)?
(clusterBy? | distributeBy? sortBy? ) src_alias
)
REDUCE expression (',' expression)*
(inRowFormat)?
USING 'my_reduce_script'
(AS colName (',' colName)* )?
(outRowFormat)?(outRecordReader)?

FROM(
FROM src
SELECT TRANSFORM '(' expression (',' expression)* ')'
(inRowFormat)?
USING 'my_map_script'
(AS colName (',' colName)* )?
(outRowFormat)?(outRecordReader)?
(clusterBy? | distributeBy? sortBy? ) src_alias
)
SELECT TRANSFORM '(' expression (',' expression)* ')'
(inRowFormat)?
USING 'my_reduce_script'
(AS colName (',' colName)* )?
(outRowFormat)?(outRecordReader)?

clusterBy:CLUSTER BY colName (',' colName)*
distributeBy:DISTRIBUTE BY colName (',' colName)*
sortBy: SORT BYcolName (ASC | DESC)? (',' colName (ASC | DESC)?)*

rowFormat
: ROW FORMAT
(DELIMITED[FIELDS TERMINATED BY char]
[COLLECTION ITEMS TERMINATED BY char]
[MAPKEYS TERMINATED BY char]
[ESCAPED BY char]
[LINES SEPARATED BY char]
| SERDE serde_name [WITH SERDEPROPERTIES
property_name=property_value,
property_name=property_value,...])
outRowFormat :rowFormat
inRowFormat :rowFormat
outRecordReader: RECORDREADER className

        默认情况下,列在传输给用户脚本之前会被转换为字符串并使用TAB分隔,NULL值会被转变为字符串常量\N以区分NULL值和空字符串。用户脚本的标准输出会被当做使用TAB分隔的字符串列,任何包含\N的单元会被重新解释为NULL,然后结果中的字符串列会转换为表中相应列的数据类型。这种默认分隔列的机制可以通过使用ROW FORMAT覆盖。在Windows下,使用“cmd/c your script”代替“your_script”。

       下面看一些例子:

FROM (
FROM pv_users
MAP pv_users.userid, pv_users.date
USING 'map_script'
AS dt, uid
CLUSTER BY dt) map_output
INSERT OVERWRITE TABLE pv_users_reduced
REDUCE map_output.dt, map_output.uid
USING 'reduce_script'
AS date, count;

FROM (
FROM pv_users
SELECT TRANSFORM(pv_users.userid,pv_users.date)
USING 'map_script'
AS dt, uid
CLUSTER BY dt) map_output
INSERT OVERWRITE TABLE pv_users_reduced
SELECT TRANSFORM(map_output.dt,map_output.uid)
USING 'reduce_script'
AS date, count;
//二者具有相同的效果,后者使用SELECT TRANSFORM代替了前者的MAP和REDUCE从句。

FROM (
FROM src
SELECT TRANSFORM(src.key, src.value) ROWFORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
USING '/bin/cat'
AS (tkey, tvalue) ROW FORMAT SERDE'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
RECORDREADER'org.apache.hadoop.hive.ql.exec.TypedBytesRecordReader'
) tmap
INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue

       如果在USING my_script之后没有AS从句,Hive假设脚本的输出包含两部分:第一个TAB之前部分为key,第一个TAB之后的剩余部分为value。这和指定AS key,value是由区别的,因为在该情况下value仅包含第一个TAB和第二个TAB之间的部分。也可以在未指定输出模式的情况下直接使用CLUSTER BY key从句:

FROM (
FROM pv_users
MAP pv_users.userid, pv_users.date
USING 'map_script'
CLUSTER BY key) map_output
INSERT OVERWRITE TABLE pv_users_reduced
REDUCE map_output.key, map_output.value
USING 'reduce_script'
AS date, count;

       如上面提到过的,默认情况下脚本的输出字段被当做字符串类型,可以立即对输出字段使用类型转换:

SELECTTRANSFORM(stuff)
USING 'script'
AS (thing1 INT,thing2 INT)