Source code for nlp_primitives.stopword_count

# -*- coding: utf-8 -*-
import nltk
import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Numeric, Text


class StopwordCount(TransformPrimitive):
    """Determines number of stopwords in a string.

    Description:
        Given list of strings, determine the number of stopwords
        in each string. Looks for any of the English stopwords defined
        in `nltk.corpus.stopwords`. Case insensitive.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['This is a test string.', 'This is second string', 'third string']
        >>> stopword_count = StopwordCount()
        >>> stopword_count(x).tolist()
        [3, 2, 0]
    """
    name = "stopword_count"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def get_function(self):
        """Return the callable that counts stopwords for each input string.

        Returns:
            A function taking an iterable of strings (possibly containing
            null values) and returning a ``pd.Series`` holding, per element,
            the stopword count, or ``NaN`` where the element is null.
        """
        def stopword_count(array):
            # Accessing the stopword list is what triggers the corpus load,
            # so a missing corpus surfaces as LookupError right here.
            try:
                swords = set(nltk.corpus.stopwords.words('english'))
            except LookupError:
                nltk.download('stopwords')
                swords = set(nltk.corpus.stopwords.words('english'))

            tokenizer = nltk.tokenize.word_tokenize
            # Merely referencing `word_tokenize` never raises; the tokenizer
            # model is only looked up when the function is called. Probe it
            # once so a missing model is downloaded up front instead of
            # raising from inside the loop below.
            # NOTE(review): newer NLTK releases may look for a differently
            # named tokenizer resource than 'punkt' -- confirm against the
            # pinned NLTK version.
            try:
                tokenizer('probe')
            except LookupError:
                nltk.download('punkt')

            counts = []
            for el in array:
                if pd.isnull(el):
                    # Missing text has no defined count.
                    counts.append(np.nan)
                else:
                    words = tokenizer(el)
                    counts.append(
                        sum(1 for word in words if word.lower() in swords)
                    )
            return pd.Series(counts)

        return stopword_count