/usr/apache_logs
access_2013_05_30.log
access_2013_05_31.log
日志格式:
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
# Flume agent "a4": ships Apache access logs from a local spooling directory to HDFS.
# Launch with:
#   bin/flume-ng agent -n a4 -f myagent/a4.conf -c conf -Dflume.root.logger=INFO,console

# Name the agent's source, channel and sink.
a4.sources = r1
a4.channels = c1
a4.sinks = k1

# Source: watch the spooling directory for newly dropped log files.
a4.sources.r1.type = spooldir
a4.sources.r1.spoolDir = /usr/apache_logs

# Channel: in-memory buffer between source and sink.
a4.channels.c1.type = memory
a4.channels.c1.capacity = 10000
a4.channels.c1.transactionCapacity = 100

# Interceptor: add a timestamp to every event.
# The key must start with the agent name ("a4."); any other prefix is not
# read as part of this agent, and the interceptor would never be attached.
a4.sources.r1.interceptors = i1
a4.sources.r1.interceptors.i1.type = org.apache.flume.interceptor.TimestampInterceptor$Builder

# Sink: write events to HDFS as a plain (uncompressed, non-SequenceFile) stream.
a4.sinks.k1.type = hdfs
a4.sinks.k1.hdfs.path = /techbbs
a4.sinks.k1.hdfs.filePrefix = events-
a4.sinks.k1.hdfs.fileType = DataStream

# Do not roll files based on event count.
a4.sinks.k1.hdfs.rollCount = 0
# Roll to a new file once the current one reaches 128 MiB (134217728 bytes).
a4.sinks.k1.hdfs.rollSize = 134217728
# Roll to a new file every 60 seconds.
a4.sinks.k1.hdfs.rollInterval = 60

# Wire the source and the sink to the channel.
a4.sources.r1.channels = c1
a4.sinks.k1.channel = c1
hdfs dfs -put /usr/apache_logs /techbbs
hdfs中 /techbbs/access_2013_05_30.log
/techbbs/access_2013_05_31.log
hadoop jar logclean.jar demo.LogCleanJobMain /techbbs/access_2013_05_30.log /techbbs/cleaned/2013_05_30
hadoop jar logclean.jar demo.LogCleanJobMain /techbbs/access_2013_05_31.log /techbbs/cleaned/2013_05_31
/techbbs/cleaned/2013_05_30
/techbbs/cleaned/2013_05_31
(这里关键之处就在于确定映射的HDFS位置,我这里是/techbbs/cleaned,即清洗后的数据存放的位置)
hive>CREATE EXTERNAL TABLE techbbs(ip string, atime string, url string) PARTITIONED BY (logdate string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/techbbs/cleaned';
hive>ALTER TABLE techbbs ADD PARTITION(logdate='2013_05_30') LOCATION '/techbbs/cleaned/2013_05_30';
页面浏览量即为PV(Page View),是指所有用户浏览页面的总和,一个独立用户每打开一个页面就被记录1次。这里,我们只需要统计日志中的记录个数即可,HQL代码如下:
hive>CREATE TABLE techbbs_pv_2013_05_30 AS SELECT COUNT(1) AS PV FROM techbbs WHERE logdate='2013_05_30';
hive> select * from techbbs_pv_2013_05_30;
该论坛的用户注册页面为member.php,而当用户点击注册时请求的又是member.php?mod=register的url。 因此,这里我们只需要统计出日志中访问的URL是member.php?mod=register的即可,HQL代码如下:
hive>CREATE TABLE techbbs_reguser_2013_05_30 AS SELECT COUNT(1) AS REGUSER FROM techbbs WHERE logdate='2013_05_30' AND INSTR(url,'member.php?mod=register')>0;
hive> select * from techbbs_reguser_2013_05_30;
一天之内,访问网站的不同独立 IP 个数加和。其中同一IP无论访问了几个页面,独立IP 数均为1。 因此,这里我们只需要统计日志中处理的独立IP数即可,在SQL中我们可以通过DISTINCT关键字,在HQL中也是通过这个关键字:
hive>CREATE TABLE techbbs_ip_2013_05_30 AS SELECT COUNT(DISTINCT ip) AS IP FROM techbbs WHERE logdate='2013_05_30';
hive> select * from techbbs_ip_2013_05_30;
跳出数是指只浏览了一个页面便离开网站的访问次数,即只浏览了一个页面后便不再访问。这里,我们可以通过用户的IP进行分组,如果分组后的记录数只有一条,那么即为跳出用户。将这些用户的数量相加,就得出了跳出用户数,HQL代码如下:
hive>CREATE TABLE techbbs_jumper_2013_05_30 AS SELECT COUNT(1) AS jumper FROM (SELECT COUNT(ip) AS times FROM techbbs WHERE logdate='2013_05_30' GROUP BY ip HAVING times=1) e;
hive> select * from techbbs_jumper_2013_05_30;
hive>CREATE TABLE techbbs_2013_05_30 AS SELECT '2013_05_30', a.pv, b.reguser, c.ip, d.jumper FROM techbbs_pv_2013_05_30 a JOIN techbbs_reguser_2013_05_30 b ON 1=1 JOIN techbbs_ip_2013_05_30 c ON 1=1 JOIN techbbs_jumper_2013_05_30 d ON 1=1;
报错:FAILED: SemanticException Cartesian products are disabled for safety reasons. If you know what you are doing, please sethive.strict.checks.cartesian.product to false and that hive.mapred.mode is not set to 'strict' to proceed.
Note that if you may get errors or incorrect results if you make a mistake while using some of the unsafe features.
解决:在SQL前面加上如下:
hive> set hive.mapred.mode=nonstrict;
hive> CREATE TABLE techbbs_2013_05_30 AS SELECT '2013_05_30', a.pv, b.reguser, c.ip, d.jumper FROM techbbs_pv_2013_05_30 a JOIN techbbs_reguser_2013_05_30 b ON 1=1 JOIN techbbs_ip_2013_05_30 c ON 1=1 JOIN techbbs_jumper_2013_05_30 d ON 1=1;
sqoop export --connect jdbc:mysql://172.30.86.231:3306/techbbs --username root --password 921027 --table techbbs_logs_stat --fields-terminated-by '\001' --export-dir '/user/hive/warehouse/techbbs_2013_05_30'
hive>show create table techbbs_2013_05_30;
LOCATION
'hdfs://bigdata11:9000/user/hive/warehouse/techbbs_2013_05_30'