Hive学习

数据来源:

http://grouplens.org/datasets/movielens/
http://files.grouplens.org/datasets/movielens/ml-100k-README.txt

创建数据库表结构

u_data

CREATE TABLE u_data (
userid INT,
movieid INT,
rating INT,
unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

u_user

CREATE TABLE u_user (
userid INT,
age INT,
gender STRING,
occupation STRING,
zipcode STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;

将文件中的数据导入到表中

LOAD DATA LOCAL INPATH 'u.data' OVERWRITE INTO TABLE u_data;
LOAD DATA LOCAL INPATH 'u.user' OVERWRITE INTO TABLE u_user;

注:
这两条语句是在hive client中运行

查询总记录数 其实是执行mapreduce任务

SELECT COUNT(*) FROM u_data;

查询评分为5的电影 其实是执行mapreduce任务

SELECT COUNT(*) FROM u_data WHERE rating=5;

联合查询

select d.userid,movieid,rating,age,gender,occupation,zipcode from u_data d join u_user u on (d.userid=u.userid);

Ratings
UserID::MovieID::Rating::Timestamp

CREATE TABLE u_ratings (
userid INT,
movieid INT,
rating INT,
unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '::'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH 'ratings.dat' OVERWRITE INTO TABLE u_ratings;

Tags
UserID::MovieID::Tag::Timestamp

CREATE TABLE u_tags (
userid INT,
movieid INT,
Tag STRING,
unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '::'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH 'tags.dat' OVERWRITE INTO TABLE u_tags;

Movies
MovieID::Title::Genres

CREATE TABLE u_movies (
movieid INT,
title STRING,
genres STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '::'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH 'movies.dat' OVERWRITE INTO TABLE u_movies;
select count(*) from u_ratings;
select count(*) from u_tags;
select count(*) from u_movies;

添加数据,并不会覆盖原有数据

LOAD DATA LOCAL INPATH 'ratings.dat' INTO TABLE u_ratings;