# Bootstrap: make sure PySpark is importable, then start a local Spark session
# and expose its SparkContext as `sc` for the RDD examples that follow.
import importlib
import importlib.util
import subprocess
import sys

# `!pip install pyspark` is IPython shell syntax and a SyntaxError under the
# plain Python interpreter. Invoking pip through the running interpreter works
# in notebooks and scripts alike, and only runs when the package is missing.
if importlib.util.find_spec("pyspark") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pyspark"])
    importlib.invalidate_caches()  # let the import system see the new package

from pyspark import SparkContext
from pyspark.sql import SparkSession

# "local[2]" runs Spark inside this process with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)
sc: SparkContext = spark.sparkContext
# Basic RDD transformations. Each example builds a small RDD, applies one
# transformation, and prints the collected result.

# map: apply the function to every element (here: double it).
rdd1 = sc.parallelize([1, 2, 3])
mapped_rdd = rdd1.map(lambda x: x * 2)
print(mapped_rdd.collect())

# filter: keep only the elements for which the predicate is true (even numbers).
rdd2 = sc.parallelize([1, 2, 3, 4, 5, 6])
fil_rdd = rdd2.filter(lambda x: x % 2 == 0)
print(fil_rdd.collect())

# flatMap: each input element yields a list; the lists are flattened into one RDD.
rdd3 = sc.parallelize([2, 3, 4])
flat_rdd = rdd3.flatMap(lambda x: [x, x * 10, x + 5])
print(flat_rdd.collect())

# union: concatenate both RDDs; elements present in both appear twice.
rdd_uni = rdd1.union(rdd2)
print(rdd_uni.collect())

# intersection: elements present in both RDDs, without duplicates.
rdd_inter = rdd1.intersection(rdd2)
print(rdd_inter.collect())

# subtract: elements of the left RDD that do not occur in the right one.
# NOTE: rdd3 is deliberately rebound to a new data set for this example.
rdd3 = sc.parallelize([1, 2, 4, 8, 9, 10])
rdd_sub = rdd3.subtract(rdd2)
print(rdd_sub.collect())

# distinct: drop repeated elements.
rdd4 = sc.parallelize([1, 2, 2, 3, 3, 4, 5, 10, 50, 5])
rdd_dis = rdd4.distinct()
print(rdd_dis.collect())
# Sampling and key-based transformations.

# sample: pick elements without replacement using a fraction of 0.3. No seed is
# supplied, so the selection can differ from run to run.
rdd = sc.parallelize([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
sampled_rdd = rdd.sample(withReplacement=False, fraction=0.3)
print(sampled_rdd.collect())

# reduceByKey: combine all values that share a key (here: sum them).
rdd = sc.parallelize([("a", 1), ("b", 2), ("a", 3), ("b", 4)])
result = rdd.reduceByKey(lambda left, right: left + right)
print(result.collect())

# sortByKey: order the pairs by key, ascending.
rdd = sc.parallelize([(3, "three"), (1, "one"), (2, "two")])
result = rdd.sortByKey(ascending=True)
print(result.collect())

# groupByKey: gather the values of each key; the grouped values are turned
# into plain lists so they print readably.
rdd = sc.parallelize([("a", 1), ("b", 2), ("a", 3), ("b", 4)])
grouped = rdd.groupByKey()
result = grouped.mapValues(list).collect()
print(result)
# Transformations that produce or operate on pairs.

# cartesian: every (left, right) combination of the two RDDs.
rdd1 = sc.parallelize([1, 2])
rdd2 = sc.parallelize(["a", "b"])
cartesian_rdd = rdd1.cartesian(rdd2)
print(cartesian_rdd.collect())

# mapValues: transform only the value of each (key, value) pair (here: square it).
rdd = sc.parallelize([("a", 2), ("b", 3)])
mapped_values_rdd = rdd.mapValues(lambda value: value ** 2)
print(mapped_values_rdd.collect())

# zip: pair up elements position by position. Both RDDs are built the same way
# from equally long lists, which zip relies on.
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize(["a", "b", "c"])
zipped = rdd1.zip(rdd2)
print("Zip:", zipped.collect())
# Partition-oriented operations.

# coalesce: start with 6 partitions and ask for 3, then report both counts.
rdd = sc.parallelize(range(1, 21), numSlices=6)
print("Partitions before:", rdd.getNumPartitions())
coalesced = rdd.coalesce(numPartitions=3)
print("Partitions after:", coalesced.getNumPartitions())

# glom: turn each partition into a single list, exposing how the elements
# are distributed across partitions.
rdd = sc.parallelize([1, 2, 3, 4, 5])
print("Original RDD:", rdd.collect())
glommed = rdd.glom()
print("Glom:", glommed.collect())
# cartesian: every (left, right) combination of the two RDDs.
# NOTE(review): this repeats the cartesian example earlier in the file; it is
# kept so the script's output is unchanged.
rdd1 = sc.parallelize([1, 2])
rdd2 = sc.parallelize(["a", "b"])
cartesian_rdd = rdd1.cartesian(rdd2)
print(cartesian_rdd.collect())

# take / glom on an RDD explicitly split into 3 partitions.
# The results are printed: as bare expressions they were computed and then
# silently discarded when the file runs as a script (only a notebook echoes
# the last expression of a cell).
rdd = sc.parallelize([1, 2, 3, 4, 5], 3)
print("Take:", rdd.take(3))
glommed = rdd.glom()
print("Glom:", glommed.collect())
