How to bin in PySpark?

You could also write a PySpark UDF:

def categorizer(age):
  if age < 6:
    return "infant"
  elif age < 18:
    return "minor"
  elif age < 60:
    return "adult"
  else: 
    return "senior"

Then:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

bucket_udf = udf(categorizer, StringType())
bucketed = df.withColumn("bucket", bucket_udf("age"))
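
A plain Python UDF comes with serialization overhead, though. The same binning can be expressed with the built-in when/otherwise functions, which Spark evaluates natively (a sketch, assuming the same df and age column as above):

import pyspark.sql.functions as F

# equivalent binning without a UDF
bucketed = df.withColumn(
    "bucket",
    F.when(F.col("age") < 6, "infant")
     .when(F.col("age") < 18, "minor")
     .when(F.col("age") < 60, "adult")
     .otherwise("senior"),
)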

In my case I had to randomly bucket a string-valued column, which required a few extra steps:

from pyspark.sql.types import LongType, IntegerType
import pyspark.sql.functions as F

buckets_number = 4    # number of buckets desired

# substring() is 1-based; modulo yields bucket ids 0 .. buckets_number - 1
df.withColumn("sub", F.substring(F.md5("my_col"), 1, 16)) \
  .withColumn("translate", F.translate("sub", "abcdefghijklmnopqrstuvwxyz", "01234567890123456789012345").cast(LongType())) \
  .select("my_col",
          (F.col("translate") % buckets_number).cast(IntegerType()).alias("bucket_my_col"))
In short, this:

  1. hashes the column with MD5
  2. substrings the result to 16 characters (otherwise the number in the following steps would be too large)
  3. translates the letters produced by MD5 into digits
  4. applies the modulo function based on the desired number of buckets
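
If MD5 specifically isn't required, Spark's built-in hash function (Murmur3) achieves the same random assignment in one step. A minimal sketch under that assumption:

import pyspark.sql.functions as F

buckets_number = 4

# hash() returns a 32-bit integer that can be negative, hence abs()
df.select(
    "my_col",
    (F.abs(F.hash("my_col")) % buckets_number).alias("bucket_my_col"))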

You can use the Bucketizer feature transformer from Spark's ML library.

values = [("a", 23), ("b", 45), ("c", 10), ("d", 60), ("e", 56), ("f", 2), ("g", 25), ("h", 40), ("j", 33)]


df = spark.createDataFrame(values, ["name", "ages"])


from pyspark.ml.feature import Bucketizer
bucketizer = Bucketizer(splits=[0, 6, 18, 60, float("inf")], inputCol="ages", outputCol="buckets")
df_buck = bucketizer.setHandleInvalid("keep").transform(df)

df_buck.show()

output

+----+----+-------+
|name|ages|buckets|
+----+----+-------+
|   a|  23|    2.0|
|   b|  45|    2.0|
|   c|  10|    1.0|
|   d|  60|    3.0|
|   e|  56|    2.0|
|   f|   2|    0.0|
|   g|  25|    2.0|
|   h|  40|    2.0|
|   j|  33|    2.0|
+----+----+-------+

If you want names for each bucket, you can use a UDF to create a new column with the bucket names:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

t = {0.0: "infant", 1.0: "minor", 2.0: "adult", 3.0: "senior"}
udf_foo = udf(lambda x: t[x], StringType())
df_buck.withColumn("age_bucket", udf_foo("buckets")).show()

output

+----+----+-------+----------+
|name|ages|buckets|age_bucket|
+----+----+-------+----------+
|   a|  23|    2.0|     adult|
|   b|  45|    2.0|     adult|
|   c|  10|    1.0|     minor|
|   d|  60|    3.0|    senior|
|   e|  56|    2.0|     adult|
|   f|   2|    0.0|    infant|
|   g|  25|    2.0|     adult|
|   h|  40|    2.0|     adult|
|   j|  33|    2.0|     adult|
+----+----+-------+----------+
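
The lookup also works without a UDF if you turn the dictionary into a literal map column (a sketch reusing the t dictionary above):

import pyspark.sql.functions as F

# flatten {0.0: "infant", ...} into alternating key/value literals
label_map = F.create_map(*[F.lit(x) for pair in t.items() for x in pair])
df_buck.withColumn("age_bucket", label_map[F.col("buckets")]).show()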