The hiveddl.sql file Hortonworks uses for sentiment analysis

The hiveddl.sql script performs the following steps to refine the data:

  • Converted the raw Twitter data into a tabular format.
  • Used the dictionary file to score the sentiment of each Tweet by comparing its count of positive words with its count of negative words, and then assigned a positive, negative, or neutral sentiment value to each Tweet (a condensed version of this scoring query is sketched after the link below).
  • Created a new table that includes the sentiment value for each Tweet.

http://hortonworks.com/hadoop-tutorial/how-to-refine-and-visualize-sentiment-data/
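
The sentiment-scoring rule in the second bullet is implemented further down through the views l1, l2, and l3 and the tweets_sentiment table. As a reading aid only (it is not part of the original hiveddl.sql file), the query below is a roughly equivalent, condensed restatement of that logic against the tweets_raw and dictionary tables defined in the script:

-- Condensed sketch of the scoring step: explode each tweet into words,
-- look up each word's polarity in the dictionary, sum +1/-1 per tweet,
-- and map the sign of the sum to a label.
SELECT
w.id,
case
when sum( case d.polarity when 'positive' then 1 when 'negative' then -1 else 0 end ) > 0 then 'positive'
when sum( case d.polarity when 'positive' then 1 when 'negative' then -1 else 0 end ) < 0 then 'negative'
else 'neutral' end as sentiment
FROM (
SELECT id, word
FROM tweets_raw
LATERAL VIEW explode(sentences(lower(text))) s AS words
LATERAL VIEW explode(words) e AS word
) w
LEFT OUTER JOIN dictionary d ON w.word = d.word
GROUP BY w.id;

The script itself splits the same computation into the l1, l2, and l3 views plus the tweets_sentiment table, which keeps each intermediate step queryable on its own.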

ADD JAR json-serde-1.1.6-SNAPSHOT-jar-with-dependencies.jar;

--create the tweets_raw table containing the records as received from Twitter

CREATE EXTERNAL TABLE tweets_raw (
id BIGINT,
created_at STRING,
source STRING,
favorited BOOLEAN,
retweet_count INT,
retweeted_status STRUCT<
text:STRING,
user:STRUCT<screen_name:STRING,name:STRING>>,
entities STRUCT<
urls:ARRAY<STRUCT<expanded_url:STRING>>,
user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
hashtags:ARRAY<STRUCT<text:STRING>>>,
text STRING,
user STRUCT<
screen_name:STRING,
name:STRING,
friends_count:INT,
followers_count:INT,
statuses_count:INT,
verified:BOOLEAN,
utc_offset:STRING, -- was INT but nulls are strings
time_zone:STRING>,
in_reply_to_screen_name STRING,
year int,
month int,
day int,
hour int
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/user/hue/upload/upload/data/tweets_raw'
;

-- create sentiment dictionary
CREATE EXTERNAL TABLE dictionary (
type string,
length int,
word string,
pos string,
stemmed string,
polarity string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/hue/upload/upload/data/dictionary';

-- create time zone to country mapping
CREATE EXTERNAL TABLE time_zone_map (
time_zone string,
country string,
notes string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/hue/upload/upload/data/time_zone_map';

-- Clean up tweets
CREATE VIEW tweets_simple AS
SELECT
id,
cast ( from_unixtime( unix_timestamp(concat( '2013 ', substring(created_at,5,15)), 'yyyy MMM dd hh:mm:ss')) as timestamp) ts,
text,
user.time_zone
FROM tweets_raw
;

CREATE VIEW tweets_clean AS
SELECT
id,
ts,
text,
m.country
FROM tweets_simple t LEFT OUTER JOIN time_zone_map m ON t.time_zone = m.time_zone;

-- Compute sentiment
create view l1 as select id, words from tweets_raw lateral view explode(sentences(lower(text))) dummy as words;
create view l2 as select id, word from l1 lateral view explode( words ) dummy as word;

-- was: select * from l2 left outer join dict d on l2.word = d.word where polarity = 'negative' limit 10;

create view l3 as select
id,
l2.word,
case d.polarity
when 'negative' then -1
when 'positive' then 1
else 0 end as polarity
from l2 left outer join dictionary d on l2.word = d.word;

create table tweets_sentiment stored as orc as select
id,
case
when sum( polarity ) > 0 then 'positive'
when sum( polarity ) < 0 then 'negative'
else 'neutral' end as sentiment
from l3 group by id;

-- put everything back together and re-number sentiment
CREATE TABLE tweetsbi
STORED AS ORC
AS
SELECT
t.*,
case s.sentiment
when 'positive' then 2
when 'neutral' then 1
when 'negative' then 0
end as sentiment
FROM tweets_clean t LEFT OUTER JOIN tweets_sentiment s on t.id = s.id;

-- for Tableau or Excel
-- UDAF sentiscore = sum(sentiment)*50 / count(sentiment)

-- context n-gram made readable
CREATE TABLE twitter_3grams
STORED AS RCFile
AS
SELECT year, month, day, hour, snippet
FROM
( SELECT
year,
month,
day,
hour,
context_ngrams(sentences(lower(text)), array("iron","man","3",null,null,null), 10) ngs
FROM tweets_raw group by year, month, day, hour
) base
LATERAL VIEW
explode( ngs ) ngsTab AS snippet -- ngsTab is an arbitrary alias; the syntax requires it even though it is not used elsewhere
;
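
The "UDAF sentiscore" comment above only hints at how the 0/1/2 sentiment codes in tweetsbi are meant to be consumed in Tableau or Excel. A minimal, hypothetical follow-up query (not part of hiveddl.sql) that computes such a score, grouped by country purely for illustration:

-- average the 0/1/2 sentiment codes per country and rescale to a 0-100 score,
-- following the sum(sentiment)*50 / count(sentiment) formula from the comment above
select
country,
sum(sentiment) * 50 / count(sentiment) as sentiscore
from tweetsbi
group by country;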