Source code for featuretools.primitives.standard.transform.natural_language.num_words

import re
from string import punctuation
from typing import Optional

import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable, NaturalLanguage

from featuretools.primitives.base import TransformPrimitive
from featuretools.primitives.standard.transform.natural_language.constants import (
    DELIMITERS,
)
from featuretools.utils.gen_utils import Library


[docs]class NumWords(TransformPrimitive): """Determines the number of words in a string. Words are sequences of characters delimited by whitespace. Examples: >>> num_words = NumWords() >>> num_words(['This is a string', ... 'Two words', ... 'no-spaces', ... 'Also works with sentences. Second sentence!']).tolist() [4, 2, 1, 6] """ name = "num_words" input_types = [ColumnSchema(logical_type=NaturalLanguage)] return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"}) compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the number of words in {}" def get_function(self): def word_counter(array): def _get_number_of_words(elem: Optional[str]): """Returns the number of words in given element, or pd.NA given null input""" if pd.isna(elem): return pd.NA return sum( 1 for word in re.split(DELIMITERS, elem) if word.strip(punctuation) ) return array.apply(_get_number_of_words) return word_counter