from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Obtain the Spark session used by the string-function examples.
spark = (
    SparkSession.builder
    .appName("String_Functions_Students")
    .getOrCreate()
)

# Column names, in the same order as the fields of each tuple in `data`.
columns = ["StudentID", "Name", "City"]

# Sample student records: (StudentID, Name, City).
data = [
    ("ST001", "Kiran Kumar", "Bangalore"),
    ("ST002", "Meera Nair", "Chennai"),
    ("ST003", "Arjun Reddy", "Hyderabad"),
    ("ST004", "Sita Devi", "Mumbai"),
]

df = spark.createDataFrame(data, schema=columns)

# Print the whole table without shortening cell values.
df.show(truncate=False)
# Each statement below applies one pyspark.sql.functions string helper to the
# student DataFrame `df` and prints the resulting column(s) with show().

# concat_ws: join StudentID, Name and City using "-" as the separator.
df.select(concat_ws("-", "StudentID", "Name", "City").alias("concat_ws")).show()

# substring: take 5 characters of Name starting at position 1.
df.select("Name", substring("Name", 1, 5).alias("Substring")).show()

# instr: position of the substring "a" within Name.
df.select("Name", instr("Name", "a").alias("Instr")).show()

# trim: remove surrounding spaces from Name.
df.select(trim("Name").alias("Trimmed")).show()

# upper / lower / initcap: change the letter case of Name.
df.select(upper("Name").alias("Upper")).show()
df.select(lower("Name").alias("Lower")).show()
df.select(initcap("Name").alias("Initcap")).show()

# regexp_replace: replace every match of the pattern "a" in City with "@".
df.select(regexp_replace("City", "a", "@").alias("RegexpReplace")).show()

# regexp_extract: group index 0 selects the whole match of "[A-Za-z]+" in City.
df.select(regexp_extract("City", "[A-Za-z]+", 0).alias("RegexpExtract")).show()

# split: break City on the pattern " " into an array column.
# Fixed: this line previously called the non-existent DataFrame method
# `.ashow()`, which raised AttributeError and stopped the script here.
df.select(split("City", " ").alias("Split")).show()

# concat: append the literal " - Student" to Name.
df.select(concat("Name", lit(" - Student")).alias("Concat")).show()

# lpad / rpad: pad StudentID to width 6 with "0" on the left / right.
df.select(lpad("StudentID", 6, "0").alias("LPAD")).show()
df.select(rpad("StudentID", 6, "0").alias("RPAD")).show()

# reverse: Name with its characters in reverse order.
df.select(reverse("Name").alias("Reversed")).show()

# repeat: Name repeated 2 times.
df.select(repeat("Name", 2).alias("Repeated")).show()

# ascii: numeric code of the first character of Name.
df.select("Name", ascii(substring("Name", 1, 1)).alias("ASCII")).show()

# locate: position of the substring "o" within City.
df.select("City", locate("o", "City").alias("Locate_Pos")).show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Obtain the Spark session used by the date/time-function examples.
spark = (
    SparkSession.builder
    .appName("DateTime_Functions")
    .getOrCreate()
)

# Column names, in the same order as the fields of each tuple in `data`.
columns = ["EmpID", "Name", "JoinDate"]

# Sample employee records: (EmpID, Name, JoinDate as a "yyyy-MM-dd" string).
data = [
    ("EMP001", "Karthik", "2023-01-10"),
    ("EMP002", "Priya", "2022-05-18"),
    ("EMP003", "Rohit", "2021-12-01"),
    ("EMP004", "Sneha", "2020-07-22"),
]

df = spark.createDataFrame(data, schema=columns)

# Print the whole table without shortening cell values.
df.show(truncate=False)
# Each statement below applies one or more pyspark.sql.functions date/time
# helpers to the employee DataFrame `df` and prints the result with show().

# Shared handle for the JoinDate column used by most examples.
join_date = col("JoinDate")

# current_date: print a single row.
df.select(current_date().alias("current_date")).show(1)

# date_format: render JoinDate with the pattern "MM-dd-yyyy".
df.select(
    join_date,
    date_format(join_date, "MM-dd-yyyy").alias("date_format"),
).show()

# to_date: parse JoinDate using the pattern "yyyy-MM-dd".
df.select(
    join_date,
    to_date(join_date, "yyyy-MM-dd").alias("to_date"),
).show()

# datediff: days from JoinDate to the current date.
df.select(
    join_date,
    datediff(current_date(), join_date).alias("datediff"),
).show()

# months_between: months from JoinDate to the current date.
df.select(
    join_date,
    months_between(current_date(), join_date).alias("months_between"),
).show()

# trunc: truncate JoinDate with the "Month" and "Year" units.
df.select(
    join_date,
    trunc(join_date, "Month").alias("Month_Trunc"),
    trunc(join_date, "Year").alias("Year_Trunc"),
).show()

# add_months / date_add / date_sub: shift JoinDate by +/-3 months and +/-5 days.
df.select(
    join_date,
    add_months(join_date, 3).alias("Add3M"),
    add_months(join_date, -3).alias("Sub3M"),
    date_add(join_date, 5).alias("Add5D"),
    date_sub(join_date, 5).alias("Sub5D"),
).show()

# dayofweek / dayofmonth / dayofyear: day numbers extracted from JoinDate.
df.select(
    "JoinDate",
    dayofweek(join_date).alias("DayOfWeek"),
    dayofmonth(join_date).alias("DayOfMonth"),
    dayofyear(join_date).alias("DayOfYear"),
).show()

# current_timestamp: print a single row without truncating the value.
df.select(current_timestamp().alias("Current_Timestamp")).show(1, truncate=False)

# Add a parsed date column, then derive AgeYears from the number of days
# between date_col and today, divided by 365.25 and cast to int.
df_with_date = df.withColumn("date_col", to_date(join_date, "yyyy-MM-dd"))
days_since_join = datediff(current_date(), col("date_col"))
df_with_age = df_with_date.withColumn(
    "AgeYears",
    (days_since_join / 365.25).cast("int"),
)
df_with_age.show()

# Average of AgeYears over all rows.
df_with_age.agg(avg("AgeYears").alias("Average_Age")).show()

# First row when sorted by AgeYears ascending, then descending.
df_with_age.orderBy("AgeYears").limit(1).show()
df_with_age.orderBy("AgeYears", ascending=False).limit(1).show()

# Rows whose date_col month number lies in the inclusive range 1..5.
df_with_age.withColumn("Month", month("date_col")).filter(
    col("Month").between(1, 5)
).show()

# last_day: last day of the month that JoinDate falls in.
df.select("JoinDate", last_day(join_date).alias("Month_End")).show()

# quarter: quarter number extracted from JoinDate.
df.select("JoinDate", quarter(join_date).alias("Quarter")).show()

# date_trunc: truncate JoinDate with the "month" unit.
df.select(
    "JoinDate",
    date_trunc("month", join_date).alias("Truncated_Month"),
).show()

from pyspark.sql.functions import from_unixtime, lit

# from_unixtime: format the literal 1700000000 with "yyyy-MM-dd HH:mm:ss".
df.select(
    from_unixtime(lit(1700000000), "yyyy-MM-dd HH:mm:ss").alias("FromUnix")
).show()

# unix_timestamp: called with no arguments.
df.select(unix_timestamp().alias("Unix_Timestamp")).show()
