In [1]:
import findspark
findspark.init()  # auto-locates SPARK_HOME and adds it to sys.path
findspark.find()
Out[1]:
'C:\\spark-3.5.6-bin-hadoop3'
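findspark can also be pointed at the installation directly when SPARK_HOME is not set in the environment; a minimal sketch, assuming the same install path as above:
In [ ]:
import findspark
# Pass the Spark install directory explicitly instead of relying on SPARK_HOME
findspark.init("C:\\spark-3.5.6-bin-hadoop3")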
In [6]:
from pyspark.sql import SparkSession
import random
# Connect to the Spark cluster
spark = (
    SparkSession.builder
    .appName("SparkPi")
    .master("spark://192.168.0.186:7077")  # assumes the cluster master is at 192.168.0.186
    .getOrCreate()
)
sc = spark.sparkContext
# ---------------- Compute Pi (Monte Carlo) ----------------
def inside(_):
    x, y = random.random(), random.random()
    return 1 if x*x + y*y < 1 else 0

num_samples = 1000000

# Distribute the sampling across the cluster in parallel
count = sc.parallelize(range(0, num_samples), numSlices=10) \
    .map(inside) \
    .reduce(lambda a, b: a + b)
pi = 4.0 * count / num_samples
print(f"Pi is roughly {pi}")
spark.stop()
Pi is roughly 3.14108
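Each worker above draws from Python's global random module, so repeated runs give slightly different estimates. Below is a minimal sketch of a reproducible variant, assuming it runs while the SparkContext above is still alive (i.e., before spark.stop()); it uses mapPartitionsWithIndex so each partition gets its own seeded generator (inside_seeded is a hypothetical helper, not part of the original notebook):
In [ ]:
def inside_seeded(index, iterator):
    # One independently seeded RNG per partition, so every run gives the same estimate
    rng = random.Random(index)
    for _ in iterator:
        x, y = rng.random(), rng.random()
        yield 1 if x*x + y*y < 1 else 0

count = sc.parallelize(range(num_samples), numSlices=10) \
    .mapPartitionsWithIndex(inside_seeded) \
    .reduce(lambda a, b: a + b)
print(f"Pi is roughly {4.0 * count / num_samples}")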
In [5]:
import pyspark
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("spark://192.168.0.186:7077")  # can also be local[*] or another cluster master
    .getOrCreate()
)
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
df.show()
+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+
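A few basic operations on the same DataFrame, as a quick sketch of the DataFrame API (column names as defined above):
In [ ]:
df.filter(df.id > 1).show()   # keep rows whose id is greater than 1
df.select("name").show()      # project a single column
df.printSchema()              # inspect the inferred schema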