Analyzing an author-keyword dataset with Spark

The dataset is extracted from Tang Jie's AMiner dataset: papers from top data mining (DM) conferences published between 2006 and 2011. Each user is a paper author, and each item is a keyword appearing in the papers.
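The exact file layout is not documented here; judging from the parsing code below (split on a single space, with fields[0] as the author, fields[1] as the keyword, and fields[2] as an integer count), each line is assumed to look like "author_id keyword_id count". A quick peek at the raw file is a reasonable first check (sc is the usual SparkContext; the HDFS path is the same one used in the analysis below):

raw = sc.textFile("hdfs:///dm/author_keyword/author_keyword.txt")
print raw.first()
# Assumed line format: "<author_id> <keyword_id> <count>", e.g. "1 205 3" (values are illustrative)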

import numpy as np
import matplotlib.pyplot as plt

# Load the raw data; each record is "author keyword count", separated by spaces
dataset = sc.textFile("hdfs:///dm/author_keyword/author_keyword.txt")
user_fields = dataset.map(lambda line: line.split(" "))
num_users = user_fields.map(lambda fields: fields[0]).distinct().count()
# num_users: 27130
num_keywords = user_fields.map(lambda fields: fields[1]).distinct().count()
# num_keywords: 13543
num_distinct_ratings = user_fields.map(lambda fields: fields[2]).distinct().count()
# num_distinct_ratings: 30 (number of distinct rating values)
ratings = user_fields.map(lambda fields: int(fields[2]))
max_rating = ratings.reduce(lambda x, y: max(x, y))
min_rating = ratings.reduce(lambda x, y: min(x, y))
# max_rating: 52, min_rating: 1
num_ratings = user_fields.count()
# num_ratings: 703320 (total number of records)
mean_rating = ratings.reduce(lambda x, y: x + y) * 1.0 / num_ratings
# mean_rating: 1.113
ratings_per_user = num_ratings / num_users
# ratings_per_user: 25 (on average each author is associated with ~25 records)
median_rating = np.median(ratings.collect())
# median_rating: 1.0 (the median rating is 1)
ratings_per_keyword = num_ratings / num_keywords
# ratings_per_keyword: 51 (on average each keyword appears in ~51 records)
ratings.stats()
# (count: 703320, mean: 1.11235852812, stdev: 0.529708031287, max: 52.0, min: 1.0)
count_by_rating = ratings.countByValue()
# {1: 650410, 2: 39472, 3: 7984, 4: 2731, 5: 1178, 6: 629, 7: 363, 8: 193, 9: 103, 10: 80, 11: 48, 12: 27, 13: 21, 14: 18, 15: 12, 16: 12, 17: 10, 18: 3, 19: 6, 20: 4, 21: 3, 22: 2, 24: 1, 27: 1, 29: 3, 30: 1, 33: 2, 34: 1, 44: 1, 52: 1}
# Plot the normalized distribution of rating values
x_axis = np.array(count_by_rating.keys())
y_axis = np.array([float(c) for c in count_by_rating.values()])
y_axis_normed = y_axis / y_axis.sum()
pos = np.arange(len(x_axis))
width = 1.0
ax = plt.axes()
ax.set_xticks(pos + (width / 2))
ax.set_xticklabels(x_axis)
plt.bar(pos, y_axis_normed, width, color='lightblue')


# Group records by author to count how many ratings each author has
user_ratings_grouped = user_fields.map(lambda fields: (int(fields[0]), int(fields[2]))).groupByKey()
# user_ratings_grouped: PythonRDD[58] at RDD at PythonRDD.scala:43
user_ratings_byuser = user_ratings_grouped.map(lambda (k, v): (k, len(v)))
user_ratings_byuser_local = user_ratings_byuser.map(lambda (k, v): v).collect()
# Histogram of the number of ratings per author
plt.hist(user_ratings_byuser_local, bins=200, color='lightblue', normed=True)
fig = plt.gcf()
fig.set_size_inches(16, 10)

Figure: distribution of rating values
Figure: histogram of rating counts per author
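The plotting calls above assume an interactive backend (for example IPython started with pylab) where figures render automatically. When the same code is run as a plain script, a minimal addition along these lines is needed (the file name is illustrative):

import matplotlib.pyplot as plt

plt.show()
# or save the current figure to disk instead of displaying it:
# plt.savefig("author_rating_histogram.png")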