hive 使用脚本清洗数据:时间戳转日期

时间:2023-03-08 18:48:37
Hive 使用脚本清洗数据:时间戳转日期(以下脚本保存为 weekday_mapper.py,将 Unix 时间戳转换为星期几)
"""Hive TRANSFORM mapper: replace a Unix timestamp column with its ISO weekday.

Reads tab-separated rows ``userid, movieid, rating, unixtime`` from standard
input and writes ``userid, movieid, rating, weekday`` to standard output,
where ``weekday`` is ``datetime.isoweekday()`` (1 = Monday ... 7 = Sunday).
"""
import sys
import datetime


def main(stream=None, out=None):
    """Convert every input row and write the result.

    Args:
        stream: Iterable of text lines to read; defaults to ``sys.stdin``.
        out: Object with a ``write`` method; defaults to ``sys.stdout``.

    Raises:
        ValueError: If a non-blank row does not have exactly four
            tab-separated fields, or the timestamp is not numeric.
    """
    if stream is None:
        stream = sys.stdin
    if out is None:
        out = sys.stdout

    for line in stream:
        line = line.strip()
        if not line:
            # A blank line (e.g. trailing newline) carries no row; skipping it
            # avoids a crash in the four-way unpack below.
            continue
        userid, movieid, rating, unixtime = line.split('\t')
        # NOTE: fromtimestamp() interprets the value in the machine's local
        # time zone, so the weekday depends on where the script runs.
        weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
        # write() instead of the Python 2 print statement keeps the script
        # runnable under both Python 2 and Python 3.
        out.write('\t'.join([userid, movieid, rating, str(weekday)]) + '\n')


if __name__ == '__main__':
    main()

Use the mapper script:

-- Destination table for the mapper output: three pass-through columns plus
-- the weekday value produced by the script. Stored as tab-delimited text.
CREATE TABLE u_data_new (
  userid  INT,
  movieid INT,
  rating  INT,
  weekday INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

-- Register the mapper script as a session resource so the TRANSFORM clause
-- below can invoke it.
ADD FILE weekday_mapper.py;

-- Stream (userid, movieid, rating, unixtime) from u_data through the script
-- and overwrite u_data_new with the four columns it emits.
INSERT OVERWRITE TABLE u_data_new
SELECT
  TRANSFORM (userid, movieid, rating, unixtime)
  USING 'python weekday_mapper.py'
  AS (userid, movieid, rating, weekday)
FROM u_data;

-- Number of rows per weekday value.
SELECT weekday, COUNT(*)
FROM u_data_new
GROUP BY weekday;
  -- Map/reduce form of a script-driven query:
  --   * the inner query pipes docs.doctext through wc_mapper.py, which emits
  --     (word, cnt) rows, and CLUSTER BY word groups rows with the same word
  --     together for the reduce step;
  --   * the outer REDUCE pipes those (word, cnt) rows through wc_reduce.py
  --     (presumably summing cnt per word -- the script is not shown here).
  FROM (
    MAP doctext USING 'python wc_mapper.py' AS (word, cnt)
    FROM docs
    CLUSTER BY word
  ) a
  REDUCE word, cnt USING 'python wc_reduce.py';