pre-commit
clone your repo
git clone https://github.com/dafrenchyman/PythonProjectTemplate.git
README.md
pyspark to the project
pip-compile on requirements.txt to add the new pyspark dependency
main
import sys


def main():
    """Entry point of the script.

    Returns None, which ``sys.exit`` treats as a successful (status 0) exit.
    """
    return


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
# Setup Spark: get the active session or create one that runs locally,
# with "local[*]" using as many worker threads as there are logical cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()
SparkSession (ALT + ENTER)
from pyspark.sql import SparkSession
# Load the CSV at `temp_csv_file` (defined outside this excerpt) into a DataFrame,
# letting Spark infer column types instead of reading every column as a string.
# NOTE: no `header` option is passed, so columns get Spark's default names
# (`_c0`, `_c1`, ...); they must be renamed (e.g. with `.toDF`) before the
# queries that refer to `sepal_length` or `class` can run.
fisher_df = spark.read.csv(temp_csv_file, inferSchema="true")
pyspark.pandas.read_csv
header option
.toDF
# Print the first rows of the DataFrame to the console.
fisher_df.show()
# Register the DataFrame as the temporary view "fisher" so it can be queried with SQL.
fisher_df.createOrReplaceTempView("fisher")
# Average of the measurement columns over the whole table.
# NOTE(review): the `, ...` line is a placeholder for the remaining AVG(...)
# columns and must be filled in before this query will run.
fisher_avg_df = spark.sql("""
SELECT
AVG(sepal_length) AS avg_sepal_length
, ...
FROM fisher"""
)
SELECT
class
, AVG(sepal_length) AS avg_sepal_length
, ...
FROM fisher
GROUP BY class
.withColumn()
pyspark.sql.functions.rand
SELECT
...
FROM fisher
ORDER BY random_column_name
fisher_random_df.show()
class
column
RandomForestClassifier
VectorAssembler
# Combine the four measurement columns into a single vector column named
# "features", the input format the classifier's `featuresCol` expects.
vector_assembler = VectorAssembler(
inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
outputCol="features",
)
# transform() returns a new DataFrame with the "features" column appended;
# rebinding fisher_df means later steps see the extra column.
fisher_df = vector_assembler.transform(fisher_df)
fisher_df.show()
StringIndexer
# Encode the string-valued "class" column as a numeric label index in "class_idx".
label_indexer = StringIndexer(inputCol="class", outputCol="class_idx")
# fit() learns the label-to-index mapping from the data; the fitted model applies it.
label_indexer_model = label_indexer.fit(fisher_df)
# Rebind fisher_df to the DataFrame with the "class_idx" column appended.
fisher_df = label_indexer_model.transform(fisher_df)
fisher_df.show()
RandomForestClassifier
# Random forest classifier that learns "class_idx" from the assembled "features" vector.
random_forest = RandomForestClassifier(
labelCol="class_idx",
featuresCol="features",
)
# Train the model on the full DataFrame.
random_forest_model = random_forest.fit(fisher_df)
# Score the same rows the model was trained on; transform() appends the
# model's output columns, including "prediction".
fisher_df_predicted = random_forest_model.transform(fisher_df)
fisher_df_predicted.show()
# Expose the scored rows to SQL as the temporary view "predicted".
fisher_df_predicted.createOrReplaceTempView("predicted")
# Accuracy = fraction of rows whose predicted label index equals the true one:
# the inner query marks each row 1 (correct) or 0, the outer query averages them.
# NOTE: the predictions were made on the rows used for fitting, so this is
# training accuracy, not an estimate of performance on unseen data.
fisher_df_accuracy = spark.sql(
"""
SELECT
SUM(correct)/COUNT(*) AS accuracy
FROM
(SELECT
CASE WHEN prediction == class_idx THEN 1
ELSE 0 END AS correct
FROM predicted) AS TMP
"""
)
fisher_df_accuracy.show()
# Chain the three steps (assemble features, index labels, train the forest)
# into one Pipeline so they are fitted and applied together, in order.
pipeline = Pipeline(
stages=[
vector_assembler,
label_indexer,
random_forest,
]
)
# NOTE(review): the pipeline re-runs vector_assembler and label_indexer, so
# fisher_df here must be the DataFrame *without* the "features" and
# "class_idx" columns. If fisher_df was already rebound by the step-by-step
# transforms, fit() fails because those output columns already exist —
# reload or keep a copy of the original DataFrame first.
model = pipeline.fit(fisher_df)
# Apply every fitted stage to the input rows and show the predictions.
fisher_df_predicted = model.transform(fisher_df)
fisher_df_predicted.show()
pre-commit