Search⌘ K
AI Features

Solution: Data Transformation

Explore how to transform data effectively using PySpark and Pandas. This lesson guides you through imputing missing values, grouping and aggregating data, selecting columns, and summarizing statistics. You will learn to convert DataFrame rows into native Python dictionaries and calculate insights on filtered data, enhancing your data processing skills with hands-on examples.

We'll cover the following...

Task

Perform summary statistics on the `review_text` and `vote` columns.

Solution

def impute_NAN_values(df,columnName,value):
    """Return *df* with NaN entries in *columnName* replaced by *value*.

    :param df: DataFrame to impute (supports a dict-style ``fillna``)
    :param columnName: name of the column whose NaNs are filled
    :param value: replacement value for the missing entries
    :return: the DataFrame with the column imputed
    """
    # fillna with a {column: value} mapping touches only the named column
    return df.fillna({columnName: value})

def show_vote_stat(df: SparkDf) -> None:
    """
    Print summary statistics of the per-product mean vote.
    :param df: A DataFrame having asin and vote columns
    :return: No Return
    """
    # Average the vote per product (asin), then summarize that distribution.
    per_product = df.groupby("asin").agg(
        fn.mean(col("vote")).alias("mean_vote")
    )
    stats_df = per_product.select("mean_vote").summary(
        "count", "min", "25%", "75%", "max"
    )

    # Collect to the driver and convert each Row into a plain dict for printing.
    stats = [row.asDict(recursive=True) for row in stats_df.collect()]
    pprint(stats)

def show_review_text_stat(df: SparkDf) -> None:
    """
    Show general Stats for review text length
    :param df: DataFrame with a review_text_len column
    :return: Nothing
    """
    # Summarize only reviews that actually have text (length > 0).
    nonempty = df.filter(col("review_text_len") > 0)
    stats_df = nonempty.select('review_text_len').summary(
        "count", "min", "25%", "75%", "max"
    )
    # Turn each summary Row into a plain dict before pretty-printing.
    stats = [row.asDict(recursive=True) for row in stats_df.collect()]
    print("Review Length Stat")
    pprint(stats)
    # Count suspiciously short reviews separately (length <= 1).
    short_review_count = df.filter(col('review_text_len') <= 1).count()
    print(f"Reviews with length one or less: {short_review_count}")

Solution to the data transformation challenge

Explanation

  • Line 3: We use the .fillna method to impute the
...