In [1]:
import os

import findspark
findspark.init() # notebook 里必须
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
from pyspark.sql.types import IntegerType, FloatType, LongType
from pyspark.sql.functions import split, regexp_extract
# ==========================
# 1️⃣ Start the SparkSession and configure OSS access
# ==========================
# The OSS credentials are read from the environment so that no secret is stored
# in the notebook. A KeyError here means the variable has not been exported in
# the environment that launches the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and should be rotated in the Aliyun console.
oss_access_key_id = os.environ["OSS_ACCESS_KEY_ID"]
oss_access_key_secret = os.environ["OSS_ACCESS_KEY_SECRET"]

spark = (
    SparkSession.builder
    .appName("ML-ETL-EDA")
    .master("spark://192.168.0.186:7077")
    # Register the Aliyun OSS filesystem so that oss:// paths can be read.
    .config("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem")
    .config("fs.oss.accessKeyId", oss_access_key_id)
    .config("fs.oss.accessKeySecret", oss_access_key_secret)
    .config("fs.oss.endpoint", "oss-cn-chengdu.aliyuncs.com")
    .getOrCreate()
)
In [2]:
# ==========================
# 2️⃣ Locations of the MovieLens files on OSS
# ==========================
# Both files sit under the same bucket folder; the folder name contains spaces.
movies_path, ratings_path = (
    "oss://spark-experiments/MovieLens 1M Dataset/movies.dat",
    "oss://spark-experiments/MovieLens 1M Dataset/ratings.dat",
)
In [3]:
# ==========================
# 3️⃣ Load movies.dat
# ==========================
# The file is read as raw text (one "value" column per line) and each line is
# split on "::" into movieId, title and genres, in that order.
raw_movies = spark.read.text(movies_path)
movie_fields = split(col("value"), "::")
movies = raw_movies.select(
    movie_fields.getItem(0).cast(IntegerType()).alias("movieId"),
    movie_fields.getItem(1).alias("title"),
    movie_fields.getItem(2).alias("genres"),
)
# ==========================
# 4️⃣ Load ratings.dat
# ==========================
# Same approach as for a "::"-delimited text file: read raw lines, split once,
# then project userId, movieId, rating and timestamp with explicit types.
raw_ratings = spark.read.text(ratings_path)
rating_fields = split(col("value"), "::")
ratings = raw_ratings.select(
    rating_fields.getItem(0).cast(IntegerType()).alias("userId"),
    rating_fields.getItem(1).cast(IntegerType()).alias("movieId"),
    rating_fields.getItem(2).cast(FloatType()).alias("rating"),
    rating_fields.getItem(3).cast(LongType()).alias("timestamp"),
)
In [4]:
# ==========================
# 5️⃣ Average rating and number of ratings per movie
# ==========================
rating_summary = ratings.groupBy("movieId").agg(
    avg("rating").alias("avg_rating"),
    count("rating").alias("num_ratings"),
)

# Attach title and genres; the left join keeps every movie that has ratings.
movie_stats = rating_summary.join(movies, on="movieId", how="left")

# Top 20 movies by number of ratings.
top_by_count = movie_stats.orderBy(col("num_ratings").desc()).limit(20)
top_by_count.show(truncate=False)

# Top 20 movies by average rating, restricted to movies with at least 100 ratings.
frequently_rated = movie_stats.where(col("num_ratings") >= 100)
top_by_avg = frequently_rated.orderBy(col("avg_rating").desc()).limit(20)
top_by_avg.show(truncate=False)
+-------+------------------+-----------+-----------------------------------------------------+-----------------------------------+ |movieId|avg_rating |num_ratings|title |genres | +-------+------------------+-----------+-----------------------------------------------------+-----------------------------------+ |2858 |4.3173862310385065|3428 |American Beauty (1999) |Comedy|Drama | |260 |4.453694416583082 |2991 |Star Wars: Episode IV - A New Hope (1977) |Action|Adventure|Fantasy|Sci-Fi | |1196 |4.292976588628763 |2990 |Star Wars: Episode V - The Empire Strikes Back (1980)|Action|Adventure|Drama|Sci-Fi|War | |1210 |4.022892819979188 |2883 |Star Wars: Episode VI - Return of the Jedi (1983) |Action|Adventure|Romance|Sci-Fi|War| |480 |3.7638473053892216|2672 |Jurassic Park (1993) |Action|Adventure|Sci-Fi | |2028 |4.337353938937053 |2653 |Saving Private Ryan (1998) |Action|Drama|War | |589 |4.058512646281616 |2649 |Terminator 2: Judgment Day (1991) |Action|Sci-Fi|Thriller | |2571 |4.315830115830116 |2590 |Matrix, The (1999) |Action|Sci-Fi|Thriller | |1270 |3.9903213317847466|2583 |Back to the Future (1985) |Comedy|Sci-Fi | |593 |4.3518231186966645|2578 |Silence of the Lambs, The (1991) |Drama|Thriller | |1580 |3.739952718676123 |2538 |Men in Black (1997) |Action|Adventure|Comedy|Sci-Fi | |1198 |4.477724741447892 |2514 |Raiders of the Lost Ark (1981) |Action|Adventure | |608 |4.254675686430561 |2513 |Fargo (1996) |Crime|Drama|Thriller | |2762 |4.406262708418057 |2459 |Sixth Sense, The (1999) |Thriller | |110 |4.234957020057307 |2443 |Braveheart (1995) |Action|Drama|War | |2396 |4.127479949345715 |2369 |Shakespeare in Love (1998) |Comedy|Romance | |1197 |4.3037100949094045|2318 |Princess Bride, The (1987) |Action|Adventure|Comedy|Romance | |527 |4.510416666666667 |2304 |Schindler's List (1993) |Drama|War | |1617 |4.219405594405594 |2288 |L.A. 
Confidential (1997) |Crime|Film-Noir|Mystery|Thriller | |1265 |3.953028972783143 |2278 |Groundhog Day (1993) |Comedy|Romance | +-------+------------------+-----------+-----------------------------------------------------+-----------------------------------+ +-------+------------------+-----------+---------------------------------------------------------------------------+-------------------------------+ |movieId|avg_rating |num_ratings|title |genres | +-------+------------------+-----------+---------------------------------------------------------------------------+-------------------------------+ |2019 |4.560509554140127 |628 |Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) |Action|Drama | |318 |4.554557700942973 |2227 |Shawshank Redemption, The (1994) |Drama | |858 |4.524966261808367 |2223 |Godfather, The (1972) |Action|Crime|Drama | |745 |4.52054794520548 |657 |Close Shave, A (1995) |Animation|Comedy|Thriller | |50 |4.517106001121705 |1783 |Usual Suspects, The (1995) |Crime|Thriller | |527 |4.510416666666667 |2304 |Schindler's List (1993) |Drama|War | |1148 |4.507936507936508 |882 |Wrong Trousers, The (1993) |Animation|Comedy | |922 |4.491489361702127 |470 |Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) |Film-Noir | |1198 |4.477724741447892 |2514 |Raiders of the Lost Ark (1981) |Action|Adventure | |904 |4.476190476190476 |1050 |Rear Window (1954) |Mystery|Thriller | |1178 |4.473913043478261 |230 |Paths of Glory (1957) |Drama|War | |260 |4.453694416583082 |2991 |Star Wars: Episode IV - A New Hope (1977) |Action|Adventure|Fantasy|Sci-Fi| |1212 |4.452083333333333 |480 |Third Man, The (1949) |Mystery|Thriller | |750 |4.4498902706656915|1367 |Dr. 
Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)|Sci-Fi|War | |720 |4.426940639269406 |438 |Wallace & Gromit: The Best of Aardman Animation (1996) |Animation | |1207 |4.425646551724138 |928 |To Kill a Mockingbird (1962) |Drama | |3435 |4.415607985480944 |551 |Double Indemnity (1944) |Crime|Film-Noir | |912 |4.412822049131217 |1669 |Casablanca (1942) |Drama|Romance|War | |2762 |4.406262708418057 |2459 |Sixth Sense, The (1999) |Thriller | |3030 |4.404651162790698 |215 |Yojimbo (1961) |Comedy|Drama|Western | +-------+------------------+-----------+---------------------------------------------------------------------------+-------------------------------+
In [5]:
# ==========================
# 6️⃣ Analysis by decade (year extracted from the title)
# ==========================
# The regex captures a four-digit number in parentheses at the very end of the title.
extract_year = regexp_extract(col("title"), r".*\((\d{4})\)$", 1).cast(IntegerType())
movies2 = movies.withColumn("year", extract_year)

# Bring the year onto the per-movie statistics.
year_lookup = movies2.select("movieId", "year")
movie_stats2 = movie_stats.join(year_lookup, on="movieId", how="left")

# year / 10 cast to an integer, so e.g. 199 stands for the years 1990-1999.
decade = (col("year") / 10).cast(IntegerType()).alias("decade")
per_decade = movie_stats2.groupBy(decade).agg(
    count("*").alias("num_movies"),
    avg("avg_rating").alias("mean_rating"),
)
per_decade.orderBy("decade").show()
+------+----------+------------------+ |decade|num_movies| mean_rating| +------+----------+------------------+ | 191| 3|2.9327485380116958| | 192| 23|3.5941454206286467| | 193| 72|3.5837934779054827| | 194| 120| 3.648636789432025| | 195| 165| 3.62267923907831| | 196| 187|3.6063844331740875| | 197| 240| 3.437194035217785| | 198| 592|3.1838354192243186| | 199| 2152|3.1461028743307633| | 200| 152| 3.050623620157244| +------+----------+------------------+
请完成以下两个题目
In [ ]:
# ==========================
# 最受欢迎(评分次数最多)的电影前 20;
# ==========================
In [ ]:
# ==========================
# 平均评分最高且评分次数≥100 的电影前 20;
# ==========================
In [ ]:
In [ ]:
In [ ]:
In [ ]: