In [1]:
import findspark
findspark.init()  # required when running PySpark from a notebook

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
from pyspark.sql.types import IntegerType, FloatType, LongType
from pyspark.sql.functions import split, regexp_extract

# ==========================
# 1️⃣ Start the SparkSession and configure OSS access
# ==========================
spark = SparkSession.builder \
    .appName("ML-ETL-EDA") \
    .master("spark://192.168.0.186:7077") \
    .config("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem") \
    .config("fs.oss.accessKeyId", "LTAI5tKJCDxHBwyXrj5XVKQs") \
    .config("fs.oss.accessKeySecret", "nPwK7jBsRxkqq6xVWutU2DCI1RIZ55") \
    .config("fs.oss.endpoint", "oss-cn-chengdu.aliyuncs.com") \
    .getOrCreate()
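A quick optional sanity check before going further (a sketch, assuming the driver can reach the standalone master at 192.168.0.186:7077): print the session metadata and run a trivial job so connectivity problems surface here rather than during the OSS reads below.

In [ ]:
# Optional sanity check: confirm the session is up and executors respond.
print("Spark version:", spark.version)
print("Master:", spark.sparkContext.master)
# A tiny distributed job; fails fast if no executors are reachable.
print("Sum of 0..99:", spark.range(100).groupBy().sum("id").collect()[0][0])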
In [2]:
# ==========================
# 2️⃣ Paths to the MovieLens 1M files on OSS
# ==========================
movies_path = "oss://spark-experiments/MovieLens 1M Dataset/movies.dat"
ratings_path = "oss://spark-experiments/MovieLens 1M Dataset/ratings.dat"
users_path = "oss://spark-experiments/MovieLens 1M Dataset/users.dat"
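Optionally verify that the OSS paths are readable before parsing anything (a sketch, assuming the hadoop-aliyun connector jars are on both the driver and executor classpaths; otherwise these reads fail with a missing-filesystem error).

In [ ]:
# Optional: peek at the first raw line of each file to confirm OSS access works.
for name, path in [("movies", movies_path), ("ratings", ratings_path), ("users", users_path)]:
    first = spark.read.text(path).first()
    print(f"{name}: {first['value'] if first else '<empty file>'}")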
In [3]:
# ==========================
# 3️⃣ Read movies.dat (MovieID::Title::Genres)
# ==========================
raw_movies = spark.read.text(movies_path)
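# spark.read.text() returns a single string column named "value"; each line is
# split on the "::" delimiter (split() takes a regex pattern, but "::" contains
# no special characters, so it matches literally).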
movies = raw_movies.withColumn("movieId", split("value", "::").getItem(0).cast(IntegerType())) \
                   .withColumn("title", split("value", "::").getItem(1)) \
                   .withColumn("genres", split("value", "::").getItem(2)) \
                   .drop("value")


# ==========================
# 4️⃣ Read ratings.dat (UserID::MovieID::Rating::Timestamp)
# ==========================
raw_ratings = spark.read.text(ratings_path)
ratings = raw_ratings.withColumn("userId", split("value", "::").getItem(0).cast(IntegerType())) \
                     .withColumn("movieId", split("value", "::").getItem(1).cast(IntegerType())) \
                     .withColumn("rating", split("value", "::").getItem(2).cast(FloatType())) \
                     .withColumn("timestamp", split("value", "::").getItem(3).cast(LongType())) \
                     .drop("value")

# ==========================
# 5️⃣ Read users.dat (UserID::Gender::Age::Occupation::Zip-code)
# ==========================
raw_users = spark.read.text(users_path)
users = raw_users.withColumn("userId", split("value", "::").getItem(0).cast(IntegerType())) \
                 .withColumn("gender", split("value", "::").getItem(1)) \
                 .withColumn("age", split("value", "::").getItem(2).cast(IntegerType())) \
                 .withColumn("occupation", split("value", "::").getItem(3).cast(IntegerType())) \
                 .withColumn("zip", split("value", "::").getItem(4)) \
                 .drop("value")
In [ ]: