Source code for featuretools.primitives.standard.transform.url.url_to_tld

from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import URL, Categorical

from featuretools.primitives.base import TransformPrimitive
from featuretools.utils.common_tld_utils import COMMON_TLDS


class URLToTLD(TransformPrimitive):
    """Determines the top level domain of a url.

    Description:
        Extract the top level domain of a url, using regex,
        and a list of common top level domains. Returns nan if
        the url is invalid or null.
        Common top level domains were pulled from this list:
        https://www.hayksaakian.com/most-popular-tlds/

        A top level domain is only recognized when it makes up a whole
        dot-separated label of the host; it is never matched as a mere
        prefix of a longer label. When several labels of the host are
        recognized, the last one is returned.

    Examples:
        >>> url_to_tld = URLToTLD()
        >>> urls = ['https://www.google.com', 'http://www.google.co.in',
        ...         'www.facebook.com']
        >>> url_to_tld(urls).to_list()
        ['com', 'in', 'com']
    """

    name = "url_to_tld"
    input_types = [ColumnSchema(logical_type=URL)]
    return_type = ColumnSchema(logical_type=Categorical, semantic_tags={"category"})

    def get_function(self):
        """Build the callable that maps a series of urls to their TLDs.

        Also stores the compiled-from-``COMMON_TLDS`` regex source on
        ``self.tlds_pattern``.

        Returns:
            A function taking a Series of url strings and returning a Series
            on the same index holding the matched TLD, or NaN where nothing
            matched.
        """
        # One capture group holding the TLD. The trailing negative lookahead
        # requires the TLD to end at a label boundary: without it the
        # alternation would accept the first listed TLD that is a *prefix* of
        # the label (e.g. a two-letter entry swallowing a longer label that
        # starts with the same letters), making the result depend on the
        # ordering of COMMON_TLDS.
        self.tlds_pattern = r"(?:\.({}))(?![A-Za-z0-9-])".format(
            "|".join(COMMON_TLDS),
        )

        def url_to_domain(x):
            """Extract the host part of each url in ``x``."""
            # Skip an optional http(s) scheme, optional "user@" prefix and
            # optional "www.", then capture everything up to the first
            # ':', '/', '?' or newline.
            p = r"^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)"
            return x.str.extract(p, expand=False)

        def url_to_tld(x):
            """Return the last recognized TLD of each url in ``x``."""
            domains = url_to_domain(x)
            # extractall yields one row per match, indexed by
            # (original index, match number).
            df = domains.str.extractall(self.tlds_pattern)
            # Keep the last match for every original row ("co.in" -> "in").
            matches = df.groupby(level=0).last()[0]
            # Rows without any match were dropped above; bring them back
            # as NaN and restore the input ordering.
            return matches.reindex(x.index)

        return url_to_tld