Hive Case Study: Consumer Purchase Data

Date: 2024-03-08 13:17:45

Sample Data

1. customer_details

customer_id,first_name,last_name,email,gender,address,country,language,job,credit_type,credit_no
1,Spencer,Raffeorty,sraffeorty0@dropbox.com,Male,9274 Lyons Court,China,Khmer Safety,Technician III,jcb,3589373385487669
2,Cherye,Poynor,cpoynor1@51.la,Female,1377 Anzinger Avenue,China,Czech,Research Nurse,instapayment,6376594861844533
3,Natasha,Abendroth,nabendroth2@scribd.com,Female,2913 Evergreen Lane,China,Yiddish,Budget/Accounting Analyst IV,visa,4041591905616356
4,Huntley,Seally,hseally3@prlog.org,Male,694 Del Sol Lane,China,Albanian,Environmental Specialist,laser,677118310740263477
5,Druci,Coad,dcoad4@weibo.com,Female,16 Debs Way,China,Hebrew,Teacher,jcb,3537287259845047
6,Sayer,Brizell,sbrizell5@opensource.org,Male,71 Banding Terrace,China,Maltese,Accountant IV,americanexpress,379709885387687
7,Becca,Brawley,bbrawley6@sitemeter.com,Female,7 Doe Crossing Junction,China,Czech,Payment Adjustment Coordinator,jcb,3545377719922245
8,Michele,Bastable,mbastable7@sun.com,Female,98 Clyde Gallagher Pass,China,Malayalam,Tax Accountant,jcb,3588131787131504
9,Marla,Brotherhood,mbrotherhood8@illinois.edu,Female,4538 Fair Oaks Trail,China,Dari,Design Engineer,china-unionpay,5602233845197745479
10,Lionello,Gogarty,lgogarty9@histats.com,Male,800 Sage Alley,China,Danish,Clinical Specialist,diners-club-carte-blanche,30290846607043

2. store_details

1,NoFrill,10
2,Lablaws,23
3,FoodMart,18
4,FoodLovers,26
5,Walmart,30

3. store_review

7430,1,5
912,3,3
4203,5,3
2205,4,4
5166,5,5
2755,5,
2036,5,5
5712,1,2
5296,5,4
6964,4,2

4. transaction_details

transaction_id,customer_id,store_id,price,product,date,time
1,225,5,47.02,Bamboo Shoots - Sliced,2017-08-04,8:18
2,290,5,43.12,Tarts Assorted,2017-09-23,14:41
3,300,1,27.01,Soup - Campbells, Minestrone,2017-07-17,11:53
4,191,2,8.08,Hold Up Tool Storage Rack,2017-12-02,20:08
5,158,4,14.03,Pail With Metal Handle 16l White,2017-12-14,6:47
6,66,4,19.33,Flour - Bread,2017-06-25,18:48
7,440,2,7.41,Rice - Jasmine Sented,2017-04-10,18:29
8,419,2,9.28,Soup - Campbells Bean Medley,2018-02-08,17:03
9,351,3,14.07,Butter - Salted,2017-07-01,2:07
10,455,5,8.31,Trout - Smoked,2018-02-20,0:53

I. Create the Database

spark-sql>create database shopping;
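
The tables in the next section are created inside this database; assuming a fresh spark-sql session, switch to it first (a small step not shown in the original commands):

spark-sql>use shopping;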

II. Create Tables

Customer details table

spark-sql>create table ext_customer_details(
customer_id string,
first_name string,
last_name string,
email string,
gender string,
address string,
country string,
language string,
job string,
credit_type string,
credit_no string)
row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1");

spark-sql>load data local inpath '/usr/local/src/datas/consume/cu.txt' into table ext_customer_details;

Transaction table

spark-sql>create table ext_transaction_details (
transaction_id string,
customer_id string,
store_id string,
price string,
product string,
purchase_date string,
purchase_time string
)row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1");

spark-sql>load data local inpath '/usr/local/src/datas/consume/transaction_details.txt' into table ext_transaction_details;

Store details table

spark-sql>create table ext_store_details (
store_id string,
store_name string,
employee_number int
)row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1");

spark-sql>load data local inpath '/usr/local/src/datas/consume/store_details.txt' into table ext_store_details;

Store review table

spark-sql>create table ext_store_review (
transaction_id string,
store_id string,
review_score int
)row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1");

spark-sql>load data local inpath '/usr/local/src/datas/consume/store_review.txt' into table ext_store_review;
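
With the four source tables loaded, a quick row count per table confirms each file landed as expected (a hedged sanity check, not part of the original steps):

spark-sql>select count(*) from ext_customer_details;
spark-sql>select count(*) from ext_transaction_details;
spark-sql>select count(*) from ext_store_details;
spark-sql>select count(*) from ext_store_review;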

Encrypting personal information

create view if not exists vw_customer_details as
select
customer_id,
first_name,
unbase64(last_name) as last_name,
unbase64(email) as email,
gender,
unbase64(address) as address,
country,
job,
credit_type,
unbase64(concat(unbase64(credit_no), 'seed')) as credit_no --better way than hash since it can be decoded base64(regexp_replace(base64(credit_no), unbase64('seed'), ''))
from ext_customer_details

ext_transaction_details contains duplicate transaction_id values

First, create a transaction_details table:

create table if not exists transaction_details (
transaction_id string,
customer_id string,
store_id string,
price string,
product string,
purchase_time string,
purchase_date date
)
partitioned by (purchase_month string)

Build a temporary result set base: rows sharing the same transaction_id receive different row numbers, so any row with rn > 1 is a duplicate:

with base as (
select row_number() over(partition by transaction_id order by 1) as rn,*
from ext_transaction_details
)
select count(*) from base where rn > 1
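
To list which transaction_id values are duplicated rather than just counting them, a plain group by also works (a hedged sketch, not in the original write-up):

select transaction_id, count(*) as cnt
from ext_transaction_details
group by transaction_id
having count(*) > 1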

Transaction details table

Generate a temporary subset base, partitioned by transaction_id, then write it into the partitioned table:

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;


with base as (
select
transaction_id,
customer_id,
store_id,
price,
product,
purchase_time,
purchase_date,
--convert the date into a yyyy-MM value for the partition column
from_unixtime(unix_timestamp(purchase_date ,'yyyy-MM-dd'), 'yyyy-MM') as purchase_month,
--assign row numbers within each transaction_id group
row_number() over(partition by transaction_id order by store_id) as rn
from ext_transaction_details
where customer_id<>'customer_id' --guard against the header row in case it was loaded as data
)
from base
insert overwrite table transaction_details partition(purchase_month)
select
if(rn = 1, transaction_id, concat(transaction_id, '_fix', rn)) as transaction_id,
--when rn = 1 keep the original transaction_id; when rn >= 2 make it unique with concat(transaction_id, '_fix', rn)
customer_id,
store_id,
price,
product,
purchase_time,
purchase_date,
purchase_month
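
After the insert, deduplication can be verified by checking that no transaction_id appears more than once in the partitioned table; an empty result means the fix worked (a hedged check, not in the original):

select transaction_id, count(*) as cnt
from transaction_details
group by transaction_id
having count(*) > 1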

Store review view

Create a view that keeps only the rows with a review score:

create view if not exists vw_store_review as
select transaction_id, review_score
from ext_store_review
where review_score is not null --review_score is an int, so a missing score is loaded as NULL rather than ''
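
Comparing the raw table with the view shows how many reviews were dropped for missing scores (a hedged check, not in the original):

select count(*) from ext_store_review;
select count(*) from vw_store_review;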

III. Analyze the Data

Customer-centric analysis

1. Find the credit card type customers use most:

spark-sql>select credit_type,count(distinct credit_no) as count_cre from ext_customer_details group by credit_type order by count_cre desc limit 1;

jcb     14
Time taken: 0.553 seconds, Fetched 1 row(s)

2. Find the top 5 job titles among customers:

spark-sql>select job,count(*) as count_job from ext_customer_details group by job order by count_job desc limit 5;

Environmental Specialist        2
Assistant Professor     1
Analyst Programmer      1
Recruiting Manager      1
Accountant IV   1
Time taken: 0.275 seconds, Fetched 5 row(s)

3. What is the most popular credit card among female customers in the United States?

spark-sql>select credit_type,count(distinct credit_no) as count_cre from ext_customer_details where  country = 'United States' and gender = 'Female' group by credit_type order by count_cre desc;

jcb     1
Time taken: 0.569 seconds, Fetched 1 row(s)

4. Customer counts by country and gender

spark-sql> select country,gender,count(*) from ext_customer_details group by country,gender;

China   Female  22
China   Male    6
country gender  1
United States   Male    1
United States   Female  1
Time taken: 0.358 seconds, Fetched 5 row(s)

Transaction-centric analysis

1. Total revenue by month

spark-sql>select purchase_month,round(sum(price),2) as sum_price from transaction_details group by purchase_month order by sum_price desc;

spark-sql>select DATE_FORMAT(purchase_date,'yyyy-MM'),round(sum(price),2) as sum_price from ext_transaction_details group by DATE_FORMAT(purchase_date,'yyyy-MM') order by sum_price desc;

2017-06 721.07
2018-03 690.26
2017-09 606.86
2017-05 587.72
2018-01 566.27
2017-10 564.49
2017-07 559.8
2017-04 543.02
2017-08 531.8
2018-02 486.14
2017-12 366.27
2017-11 334.58
Time taken: 0.935 seconds, Fetched 13 row(s)

2. Total revenue by quarter

spark-sql>select round(sum(price),2)as revenue_price,year_quarter from (select price,concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0)as string))as year_quarter from transaction_details) group by year_quarter order by revenue_price desc;
1851.81 2017-2 --Q2 2017, i.e. total revenue for April, May, and June 2017
1742.67 2018-1
1698.46 2017-3
1265.34 2017-4
Time taken: 1.138 seconds, Fetched 4 row(s)

3. Total revenue by year

spark-sql>select round(sum(price),2) as revenue_price,year(purchase_date) as years from ext_transaction_details group by year(purchase_date) order by revenue_price desc;

4815.61 2017
1742.67 2018
Time taken: 0.696 seconds, Fetched 2 row(s)

4. Total revenue by day of week

spark-sql>select round(sum(price),2) as total_rev,DATE_FORMAT(purchase_date,'u') as week from transaction_details group by DATE_FORMAT(purchase_date,'u') order by total_rev desc;

1153.9  2 --revenue across all Tuesdays (weekday 2)
1104.69 5
1026.67 3
969.47  7
894.33  4
741.44  1
667.78  6
Time taken: 0.933 seconds, Fetched 7 row(s)

5. Total revenue / average spend by time bucket (early morning, noon, etc.) (requires data cleaning)

--define the time buckets

--early morning: (5:00, 8:00]

--morning: (8:00, 11:00]

--noon: (11:00, 13:00]

--afternoon: (13:00, 18:00]

--evening: (18:00, 22:00]

--night: (22:00, 5:00]

--the night bucket wraps around midnight, so it is not a linear range and is handled as the else branch

--the purchase time also needs normalizing: first convert values like '7:23 PM' to '19:23', then turn the minutes into a fraction of an hour before comparing
-------------------------------------Approach 1-----------------------------------------------
spark-sql>select time_bucket, round(avg(price),2) as avg_spend, round(sum(price)/1000,2) as revenue_k
from (
    select price, purchase_time, purchase_time_in_hrs,
        if(purchase_time_in_hrs > 5 and purchase_time_in_hrs <= 8, 'early morning',
        if(purchase_time_in_hrs > 8 and purchase_time_in_hrs <= 11, 'morning',
        if(purchase_time_in_hrs > 11 and purchase_time_in_hrs <= 13, 'noon',
        if(purchase_time_in_hrs > 13 and purchase_time_in_hrs <= 18, 'afternoon',
        if(purchase_time_in_hrs > 18 and purchase_time_in_hrs <= 22, 'evening', 'night'))))) as time_bucket
    from (
        select purchase_time, price,
            (cast(split(time_format, ':')[0] as decimal(4,2)) + cast(split(time_format, ':')[1] as decimal(4,2))/60) as purchase_time_in_hrs
        from (
            select price, purchase_time,
                if(purchase_time like '%M',
                   from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'),
                   purchase_time) as time_format
            from transaction_details)))
group by time_bucket order by avg_spend desc;

-------------------------------------Approach 2-----------------------------------------------
spark-sql>with base as (
    select price, purchase_time,
        if(purchase_time like '%M',
           from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'),
           purchase_time) as time_format
    from transaction_details
),
timeformat as (
    select purchase_time, price,
        (cast(split(time_format, ':')[0] as decimal(4,2)) + cast(split(time_format, ':')[1] as decimal(4,2))/60) as purchase_time_in_hrs
    from base
),
timebucket as (
    select price, purchase_time, purchase_time_in_hrs,
        if(purchase_time_in_hrs > 5 and purchase_time_in_hrs <= 8, 'early morning',
        if(purchase_time_in_hrs > 8 and purchase_time_in_hrs <= 11, 'morning',
        if(purchase_time_in_hrs > 11 and purchase_time_in_hrs <= 13, 'noon',
        if(purchase_time_in_hrs > 13 and purchase_time_in_hrs <= 18, 'afternoon',
        if(purchase_time_in_hrs > 18 and purchase_time_in_hrs <= 22, 'evening', 'night'))))) as time_bucket
    from timeformat
)
select time_bucket, round(avg(price),2) as avg_spend, round(sum(price)/1000,2) as revenue_k
from timebucket group by time_bucket order by avg_spend desc;

night   29.01   1.83
noon    28.23   0.45
early morning   27.83   1.03
evening 27.01   1.0
morning 25.58   1.0
afternoon       24.55   1.25
Time taken: 1.285 seconds, Fetched 6 row(s)

6. Average spend by day of week

spark-sql>select round(avg(price),2) as avg_price,DATE_FORMAT(purchase_date,'u') as week from ext_transaction_details where date_format(purchase_date, 'u') is not null group by DATE_FORMAT(purchase_date,'u') order by avg_price desc;
29.59   2
28.85   4
28.33   5
27.02   3
26.48   1
24.86   7
23.03   6
Time taken: 0.328 seconds, Fetched 7 row(s)

7. Transaction counts by year, quarter, month, and weekday

spark-sql>with base as (
select transaction_id, date_format(purchase_date, 'u') as weekday, purchase_month,
concat_ws('-', substr(purchase_date, 1, 4), cast(ceil(month(purchase_date)/3.0) as string)) as year_quarter, substr(purchase_date, 1, 4) as year
from transaction_details
where purchase_month is not null    )
select count(distinct transaction_id) as total, weekday, purchase_month, year_quarter, year
from base group by weekday, purchase_month, year_quarter, year order by year, purchase_month;

total   weekday purchase_month  year_quarter    year
1       2       2017-04 2017-2  2017  --orders placed on weekday 2 (Tuesday) in April 2017, which falls in Q2 2017
7       5       2017-04 2017-2  2017
3       6       2017-04 2017-2  2017
4       1       2017-04 2017-2  2017
2       7       2017-04 2017-2  2017