Source code for featuretools.primitives.standard.aggregation.n_most_common_frequency

import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Categorical

from featuretools.primitives.base import AggregationPrimitive


[docs]class NMostCommonFrequency(AggregationPrimitive):
    """Determines the frequency of the n most common items.

    Args:
        n (int): defines "n" in "n most common". Defaults to 3.
        skipna (bool): Determines if to use NA/null values.
            Defaults to True to skip NA/null.

    Description:
        Given a list, find the n most common items, and return a series
        showing the frequency of each item. If the list has less than n unique
        values, the resulting series will be padded with nan.

    Examples:
        >>> n_most_common_frequency = NMostCommonFrequency()
        >>> n_most_common_frequency([1, 1, 1, 2, 2, 3, 4, 4]).to_list()
        [3, 2, 2]

        We can increase n to include more items.

        >>> n_most_common_frequency = NMostCommonFrequency(4)
        >>> n_most_common_frequency([1, 1, 1, 2, 2, 3, 4, 4]).to_list()
        [3, 2, 2, 1]

        NaNs are skipped by default.

        >>> n_most_common_frequency = NMostCommonFrequency(3)
        >>> n_most_common_frequency([1, 1, 1, 2, 2, 3, 4, 4, None, None, None]).to_list()
        [3, 2, 2]

        However, the way NaNs are treated can be controlled.

        >>> n_most_common_frequency = NMostCommonFrequency(3, skipna=False)
        >>> n_most_common_frequency([1, 1, 1, 2, 2, 3, 4, 4, None, None, None]).to_list()
        [3, 3, 2]
    """

    name = "n_most_common_frequency"
    input_types = [ColumnSchema(semantic_tags={"category"})]
    return_type = ColumnSchema(logical_type=Categorical, semantic_tags={"category"})

[docs]    def __init__(self, n=3, skipna=True):
        self.n = n
        self.number_output_features = n
        self.skipna = skipna

    def get_function(self):
        def n_most_common_frequency(data, n=self.n):
            frequencies = data.value_counts(dropna=self.skipna)
            n_most_common = frequencies.iloc[0:n]
            nan_add = n - frequencies.shape[0]
            if nan_add > 0:
                n_most_common = pd.concat(
                    [n_most_common, pd.Series([np.nan] * nan_add)],
                )
            return n_most_common

        return n_most_common_frequency
Table of Contents

Quick search

Source code for featuretools.primitives.standard.aggregation.n_most_common_frequency