In [1]:
import findspark
findspark.init()   # automatically locate SPARK_HOME and add it to sys.path
findspark.find()
Out[1]:
'C:\\spark-3.5.6-bin-hadoop3'
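A quick sanity check, assuming findspark.init() succeeded: the pyspark package should now be importable, and its version should match the Spark build found above (3.5.6 here).
In [ ]:
import pyspark
# Should report 3.5.6, matching the spark-3.5.6-bin-hadoop3 install located above
print(pyspark.__version__)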
In [6]:
from pyspark.sql import SparkSession
import random

# Connect to the Spark cluster
spark = (
    SparkSession.builder
    .appName("SparkPi")
    .master("spark://192.168.0.186:7077")  # 这里假设你的master在192.168.0.186
    .getOrCreate()
)

sc = spark.sparkContext

# ---------------- Compute Pi (Monte Carlo) ----------------
def inside(_):
    # Sample a point uniformly in the unit square; count it
    # if it falls inside the quarter circle of radius 1.
    x, y = random.random(), random.random()
    return 1 if x*x + y*y < 1 else 0

num_samples = 1000000

# Distribute the samples across the cluster for parallel computation
count = sc.parallelize(range(0, num_samples), numSlices=10) \
          .map(inside) \
          .reduce(lambda a, b: a + b)

pi = 4.0 * count / num_samples  # fraction inside the quarter circle ≈ pi/4
print(f"Pi is roughly {pi}")

spark.stop()
Pi is roughly 3.14108
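The estimate varies from run to run because each task draws from an unseeded random module. A minimal sketch of a reproducible variant, seeding one RNG per partition via mapPartitionsWithIndex (using the partition index as the seed is an illustrative choice); it reuses sc and num_samples from the cell above, so it must run before spark.stop():
In [ ]:
import random

def count_inside(index, it):
    # One seeded RNG per partition makes runs reproducible;
    # seeding with the partition index is an arbitrary choice.
    rng = random.Random(index)
    yield sum(1 for _ in it if rng.random()**2 + rng.random()**2 < 1)

count = sc.parallelize(range(num_samples), 10) \
          .mapPartitionsWithIndex(count_inside) \
          .sum()
print(f"Pi is roughly {4.0 * count / num_samples}")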
In [5]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("spark://192.168.0.186:7077")  # change to local[*] or your cluster master as needed
    .getOrCreate()
)

df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
df.show()
+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+
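A small follow-up sketch: the same DataFrame queried through Spark SQL. The view name "people" is an illustrative choice, and the explicit stop releases the resources this cell's session holds.
In [ ]:
# Register the DataFrame as a temporary view so it can be queried with SQL
df.createOrReplaceTempView("people")
spark.sql("SELECT name FROM people WHERE id = 2").show()

spark.stop()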
