In [1]:
import findspark
findspark.init()  # notebook 里必须

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
from pyspark.sql.types import IntegerType, FloatType, LongType
from pyspark.sql.functions import split, regexp_extract

# ==========================
# 1️⃣ 启动 SparkSession,配置 OSS
# ==========================
spark = SparkSession.builder \
    .appName("ML-ETL-EDA") \
    .master("spark://192.168.0.186:7077") \
    .config("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem") \
    .config("fs.oss.accessKeyId", "LTAI5tKJCDxHBwyXrj5XVKQs") \
    .config("fs.oss.accessKeySecret", "nPwK7jBsRxkqq6xVWutU2DCI1RIZ55") \
    .config("fs.oss.endpoint", "oss-cn-chengdu.aliyuncs.com") \
    .getOrCreate()
In [2]:
# ==========================
# 2️⃣ OSS 上的 MovieLens 文件路径
# ==========================
movies_path = "oss://spark-experiments/MovieLens 1M Dataset/movies.dat"
ratings_path = "oss://spark-experiments/MovieLens 1M Dataset/ratings.dat"
In [3]:
# ==========================
# 3️⃣ 读取 movies.dat
# ==========================
raw_movies = spark.read.text(movies_path)
movies = raw_movies.withColumn("movieId", split("value", "::").getItem(0).cast(IntegerType())) \
                   .withColumn("title", split("value", "::").getItem(1)) \
                   .withColumn("genres", split("value", "::").getItem(2)) \
                   .drop("value")


# ==========================
# 4️⃣ 读取 ratings.dat
# ==========================
raw_ratings = spark.read.text(ratings_path)
ratings = raw_ratings.withColumn("userId", split("value", "::").getItem(0).cast(IntegerType())) \
                     .withColumn("movieId", split("value", "::").getItem(1).cast(IntegerType())) \
                     .withColumn("rating", split("value", "::").getItem(2).cast(FloatType())) \
                     .withColumn("timestamp", split("value", "::").getItem(3).cast(LongType())) \
                     .drop("value")
In [4]:
# ==========================
# 5️⃣ 每个电影的平均评分 & 评分次数
# ==========================
movie_stats = ratings.groupBy("movieId").agg(
    avg("rating").alias("avg_rating"),
    count("rating").alias("num_ratings")
)

# join title
movie_stats = movie_stats.join(movies, on="movieId", how="left")

# Top 20 by count
top_by_count = movie_stats.orderBy(desc("num_ratings")).limit(20)
top_by_count.show(truncate=False)

# Top 20 by avg rating (至少 100 条评分)
top_by_avg = movie_stats.filter(col("num_ratings") >= 100).orderBy(desc("avg_rating")).limit(20)
top_by_avg.show(truncate=False)
+-------+------------------+-----------+-----------------------------------------------------+-----------------------------------+
|movieId|avg_rating        |num_ratings|title                                                |genres                             |
+-------+------------------+-----------+-----------------------------------------------------+-----------------------------------+
|2858   |4.3173862310385065|3428       |American Beauty (1999)                               |Comedy|Drama                       |
|260    |4.453694416583082 |2991       |Star Wars: Episode IV - A New Hope (1977)            |Action|Adventure|Fantasy|Sci-Fi    |
|1196   |4.292976588628763 |2990       |Star Wars: Episode V - The Empire Strikes Back (1980)|Action|Adventure|Drama|Sci-Fi|War  |
|1210   |4.022892819979188 |2883       |Star Wars: Episode VI - Return of the Jedi (1983)    |Action|Adventure|Romance|Sci-Fi|War|
|480    |3.7638473053892216|2672       |Jurassic Park (1993)                                 |Action|Adventure|Sci-Fi            |
|2028   |4.337353938937053 |2653       |Saving Private Ryan (1998)                           |Action|Drama|War                   |
|589    |4.058512646281616 |2649       |Terminator 2: Judgment Day (1991)                    |Action|Sci-Fi|Thriller             |
|2571   |4.315830115830116 |2590       |Matrix, The (1999)                                   |Action|Sci-Fi|Thriller             |
|1270   |3.9903213317847466|2583       |Back to the Future (1985)                            |Comedy|Sci-Fi                      |
|593    |4.3518231186966645|2578       |Silence of the Lambs, The (1991)                     |Drama|Thriller                     |
|1580   |3.739952718676123 |2538       |Men in Black (1997)                                  |Action|Adventure|Comedy|Sci-Fi     |
|1198   |4.477724741447892 |2514       |Raiders of the Lost Ark (1981)                       |Action|Adventure                   |
|608    |4.254675686430561 |2513       |Fargo (1996)                                         |Crime|Drama|Thriller               |
|2762   |4.406262708418057 |2459       |Sixth Sense, The (1999)                              |Thriller                           |
|110    |4.234957020057307 |2443       |Braveheart (1995)                                    |Action|Drama|War                   |
|2396   |4.127479949345715 |2369       |Shakespeare in Love (1998)                           |Comedy|Romance                     |
|1197   |4.3037100949094045|2318       |Princess Bride, The (1987)                           |Action|Adventure|Comedy|Romance    |
|527    |4.510416666666667 |2304       |Schindler's List (1993)                              |Drama|War                          |
|1617   |4.219405594405594 |2288       |L.A. Confidential (1997)                             |Crime|Film-Noir|Mystery|Thriller   |
|1265   |3.953028972783143 |2278       |Groundhog Day (1993)                                 |Comedy|Romance                     |
+-------+------------------+-----------+-----------------------------------------------------+-----------------------------------+

+-------+------------------+-----------+---------------------------------------------------------------------------+-------------------------------+
|movieId|avg_rating        |num_ratings|title                                                                      |genres                         |
+-------+------------------+-----------+---------------------------------------------------------------------------+-------------------------------+
|2019   |4.560509554140127 |628        |Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)        |Action|Drama                   |
|318    |4.554557700942973 |2227       |Shawshank Redemption, The (1994)                                           |Drama                          |
|858    |4.524966261808367 |2223       |Godfather, The (1972)                                                      |Action|Crime|Drama             |
|745    |4.52054794520548  |657        |Close Shave, A (1995)                                                      |Animation|Comedy|Thriller      |
|50     |4.517106001121705 |1783       |Usual Suspects, The (1995)                                                 |Crime|Thriller                 |
|527    |4.510416666666667 |2304       |Schindler's List (1993)                                                    |Drama|War                      |
|1148   |4.507936507936508 |882        |Wrong Trousers, The (1993)                                                 |Animation|Comedy               |
|922    |4.491489361702127 |470        |Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)                              |Film-Noir                      |
|1198   |4.477724741447892 |2514       |Raiders of the Lost Ark (1981)                                             |Action|Adventure               |
|904    |4.476190476190476 |1050       |Rear Window (1954)                                                         |Mystery|Thriller               |
|1178   |4.473913043478261 |230        |Paths of Glory (1957)                                                      |Drama|War                      |
|260    |4.453694416583082 |2991       |Star Wars: Episode IV - A New Hope (1977)                                  |Action|Adventure|Fantasy|Sci-Fi|
|1212   |4.452083333333333 |480        |Third Man, The (1949)                                                      |Mystery|Thriller               |
|750    |4.4498902706656915|1367       |Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)|Sci-Fi|War                     |
|720    |4.426940639269406 |438        |Wallace & Gromit: The Best of Aardman Animation (1996)                     |Animation                      |
|1207   |4.425646551724138 |928        |To Kill a Mockingbird (1962)                                               |Drama                          |
|3435   |4.415607985480944 |551        |Double Indemnity (1944)                                                    |Crime|Film-Noir                |
|912    |4.412822049131217 |1669       |Casablanca (1942)                                                          |Drama|Romance|War              |
|2762   |4.406262708418057 |2459       |Sixth Sense, The (1999)                                                    |Thriller                       |
|3030   |4.404651162790698 |215        |Yojimbo (1961)                                                             |Comedy|Drama|Western           |
+-------+------------------+-----------+---------------------------------------------------------------------------+-------------------------------+

In [5]:
# ==========================
# 6️⃣ 年代分析(从 title 提取年份)
# ==========================
extract_year = regexp_extract(col("title"), r".*\((\d{4})\)$", 1).cast(IntegerType())
movies2 = movies.withColumn("year", extract_year)

movie_stats2 = movie_stats.join(movies2.select("movieId","year"), on="movieId", how="left")
movie_stats2.groupBy((col("year")/10).cast(IntegerType()).alias("decade")) \
    .agg(count("*").alias("num_movies"), avg("avg_rating").alias("mean_rating")) \
    .orderBy("decade") \
    .show()
+------+----------+------------------+
|decade|num_movies|       mean_rating|
+------+----------+------------------+
|   191|         3|2.9327485380116958|
|   192|        23|3.5941454206286467|
|   193|        72|3.5837934779054827|
|   194|       120| 3.648636789432025|
|   195|       165|  3.62267923907831|
|   196|       187|3.6063844331740875|
|   197|       240| 3.437194035217785|
|   198|       592|3.1838354192243186|
|   199|      2152|3.1461028743307633|
|   200|       152| 3.050623620157244|
+------+----------+------------------+

请完成以下两个题目¶

In [ ]:
# ==========================
# 最受欢迎(评分次数最多)的电影前 20;
# ==========================
In [ ]:
# ==========================
# 平均评分最高且评分次数≥100 的电影前 20;
# ==========================
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: