# Source code for featuretools.primitives.standard.transform.email.is_free_email_domain
import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import BooleanNullable, EmailAddress
from featuretools.primitives.base import TransformPrimitive
class IsFreeEmailDomain(TransformPrimitive):
    """Determines if an email address is from a free email domain.

    Description:
        EmailAddress input should be a string. Will return NaN
        if an invalid email address is provided, or if the input is
        not a string. The list of free email domains used in this primitive
        was obtained from https://github.com/willwhite/freemail/blob/master/data/free.txt.

    Examples:
        >>> is_free_email_domain = IsFreeEmailDomain()
        >>> is_free_email_domain(['name@gmail.com', 'name@featuretools.com']).tolist()
        [True, False]
    """

    name = "is_free_email_domain"
    input_types = [ColumnSchema(logical_type=EmailAddress)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    # Data file holding one free-mail domain per line; resolved via get_filepath.
    filename = "free_email_provider_domains.txt"

    def get_function(self):
        """Build the callable that flags addresses hosted on a free domain.

        The domain list is read once from ``self.filename`` and captured by
        the returned closure, so the file is not re-read on every call.

        Returns:
            callable: ``is_free_email_domain(emails)``. For non-empty input it
            returns the values of a column holding True/False per address,
            with NaN wherever no domain could be extracted. For empty input
            it returns an empty ``category`` Series.
        """
        file_path = self.get_filepath(self.filename)
        free_domains = pd.read_csv(file_path, header=None, names=["domain"])
        # Drop stray whitespace so membership tests compare clean domains.
        free_domains["domain"] = free_domains.domain.str.strip()

        def is_free_email_domain(emails):
            # if the input is empty return an empty Series
            if len(emails) == 0:
                return pd.Series([], dtype="category")

            emails_df = pd.DataFrame({"email": emails})

            # if all emails are NaN there is nothing to split, so every
            # domain is NaN
            if emails_df["email"].isnull().all():
                emails_df["domain"] = np.nan
            else:
                # .str.strip() and .str.split() return NaN for NaN values and
                # propagate NaNs into new columns
                parts = emails_df["email"].str.strip().str.split("@", expand=True)
                if 1 in parts.columns:
                    emails_df["domain"] = parts[1]
                else:
                    # No address contained "@", so the expanded frame has no
                    # second column; indexing it would raise KeyError. Treat
                    # every address as invalid instead.
                    emails_df["domain"] = np.nan

            emails_df["is_free"] = emails_df["domain"].isin(free_domains["domain"])

            # if there are any NaN domain values, change the series type to allow for
            # both bools and NaN values and set is_free to NaN for the NaN domains
            missing_domain = emails_df["domain"].isnull()
            if missing_domain.values.any():
                emails_df["is_free"] = emails_df["is_free"].astype("object")
                emails_df.loc[missing_domain, "is_free"] = np.nan

            return emails_df.is_free.values

        return is_free_email_domain