Please enable JavaScript.
Coggle requires JavaScript to display documents.
PySpark - Coggle Diagram
PySpark
pyspark.ml
import Pipeline
flights_pipe = Pipeline(stages=[dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler])
-
training, test = piped_data.randomSplit([.6,.4])  # train 0.6, test 0.4
-
-
-
-
-
-
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))
grid = grid.addGrid(lr.elasticNetParam, [0,1])
-
cv = tune.CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
-
-
-
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})
Preprocessing
change type to numeric
data.withColumn('col', data.col.cast('integer'/'double'))
one hot encoding
from pyspark.ml.feature import StringIndexer, OneHotEncoder
flights_indexed = StringIndexer(inputCol = 'carrier', outputCol ='carrier_index').fit(flights_indexed).transform(flights_indexed)
OneHotEncoder(inputCol = 'carrier_index', outputCol ='carrier_fact')
-
Text to Tables
-
-
books = books.withColumn('text', regexp_replace(books.text, REGEX, ' '))
Text to Tokens
-
books = Tokenizer(inputCol = 'text', outputCol = 'tokens').transform(books)
-
-
-
Bucketizer
bucketizer = Bucketizer(splits=[3500,4000,4500,6000], inputCol='rpm', outputCol='rpm_bin')
Create connection
SparkContext() as sc
start Spark DataFrame
SparkSession as spark
-
-
-
-
spark.read.csv(file_path, header = True, schema = schema, inferSchema = True/False, nullValue='NA')
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema = StructType([StructField("id", IntegerType()),
-
-
-
-
-