import dask.dataframe as dd
import numpy as np
import pandas as pd
from featuretools.primitives.base.transform_primitive_base import (
TransformPrimitive
)
from featuretools.utils import convert_time_units
from featuretools.utils.entity_utils import replace_latlong_nan
from featuretools.variable_types import (
Boolean,
DateOfBirth,
Datetime,
DatetimeTimeIndex,
LatLong,
Numeric,
Ordinal,
Text,
Variable
)
class IsNull(TransformPrimitive):
"""Determines if a value is null.
Examples:
>>> is_null = IsNull()
>>> is_null([1, None, 3]).tolist()
[False, True, False]
"""
name = "is_null"
input_types = [Variable]
return_type = Boolean
dask_compatible = True
def get_function(self):
def isnull(array):
if isinstance(array, dd.Series):
return dd.Series.isnull(array)
else:
return pd.isnull(pd.Series(array))
return isnull
[docs]class Absolute(TransformPrimitive):
"""Computes the absolute value of a number.
Examples:
>>> absolute = Absolute()
>>> absolute([3.0, -5.0, -2.4]).tolist()
[3.0, 5.0, 2.4]
"""
name = "absolute"
input_types = [Numeric]
return_type = Numeric
dask_compatible = True
def get_function(self):
return np.absolute
[docs]class TimeSincePrevious(TransformPrimitive):
"""Compute the time since the previous entry in a list.
Args:
unit (str): Defines the unit of time to count from.
Defaults to Seconds. Acceptable values:
years, months, days, hours, minutes, seconds, milliseconds, nanoseconds
Description:
Given a list of datetimes, compute the time in seconds elapsed since
the previous item in the list. The result for the first item in the
list will always be `NaN`.
Examples:
>>> from datetime import datetime
>>> time_since_previous = TimeSincePrevious()
>>> dates = [datetime(2019, 3, 1, 0, 0, 0),
... datetime(2019, 3, 1, 0, 2, 0),
... datetime(2019, 3, 1, 0, 3, 0),
... datetime(2019, 3, 1, 0, 2, 30),
... datetime(2019, 3, 1, 0, 10, 0)]
>>> time_since_previous(dates).tolist()
[nan, 120.0, 60.0, -30.0, 450.0]
"""
name = "time_since_previous"
input_types = [DatetimeTimeIndex]
return_type = Numeric
[docs] def __init__(self, unit="seconds"):
self.unit = unit.lower()
def get_function(self):
def pd_diff(values):
return convert_time_units(values.diff().apply(lambda x: x.total_seconds()), self.unit)
return pd_diff
[docs]class Day(TransformPrimitive):
"""Determines the day of the month from a datetime.
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 3, 1),
... datetime(2019, 3, 3),
... datetime(2019, 3, 31)]
>>> day = Day()
>>> day(dates).tolist()
[1, 3, 31]
"""
name = "day"
input_types = [Datetime]
return_type = Ordinal
dask_compatible = True
def get_function(self):
def day(vals):
if isinstance(vals, dd.Series):
return vals.dt.day
return pd.DatetimeIndex(vals).day.values
return day
[docs]class Hour(TransformPrimitive):
"""Determines the hour value of a datetime.
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 3, 1),
... datetime(2019, 3, 3, 11, 10, 50),
... datetime(2019, 3, 31, 19, 45, 15)]
>>> hour = Hour()
>>> hour(dates).tolist()
[0, 11, 19]
"""
name = "hour"
input_types = [Datetime]
return_type = Ordinal
dask_compatible = True
def get_function(self):
def hour(vals):
if isinstance(vals, dd.Series):
return vals.dt.hour
return pd.DatetimeIndex(vals).hour.values
return hour
[docs]class Second(TransformPrimitive):
"""Determines the seconds value of a datetime.
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 3, 1),
... datetime(2019, 3, 3, 11, 10, 50),
... datetime(2019, 3, 31, 19, 45, 15)]
>>> second = Second()
>>> second(dates).tolist()
[0, 50, 15]
"""
name = "second"
input_types = [Datetime]
return_type = Numeric
dask_compatible = True
def get_function(self):
def second(vals):
if isinstance(vals, dd.Series):
return vals.dt.second
return pd.DatetimeIndex(vals).second.values
return second
[docs]class Minute(TransformPrimitive):
"""Determines the minutes value of a datetime.
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 3, 1),
... datetime(2019, 3, 3, 11, 10, 50),
... datetime(2019, 3, 31, 19, 45, 15)]
>>> minute = Minute()
>>> minute(dates).tolist()
[0, 10, 45]
"""
name = "minute"
input_types = [Datetime]
return_type = Numeric
dask_compatible = True
def get_function(self):
def minute(vals):
if isinstance(vals, dd.Series):
return vals.dt.minute
return pd.DatetimeIndex(vals).minute.values
return minute
[docs]class Week(TransformPrimitive):
"""Determines the week of the year from a datetime.
Description:
Returns the week of the year from a datetime value. The first week
of the year starts on January 1, and week numbers increment each
Monday.
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 1, 3),
... datetime(2019, 6, 17, 11, 10, 50),
... datetime(2019, 11, 30, 19, 45, 15)]
>>> week = Week()
>>> week(dates).tolist()
[1, 25, 48]
"""
name = "week"
input_types = [Datetime]
return_type = Ordinal
dask_compatible = True
def get_function(self):
def week(vals):
if isinstance(vals, dd.Series):
return vals.dt.week
return pd.DatetimeIndex(vals).week.values
return week
[docs]class Month(TransformPrimitive):
"""Determines the month value of a datetime.
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 3, 1),
... datetime(2019, 6, 17, 11, 10, 50),
... datetime(2019, 11, 30, 19, 45, 15)]
>>> month = Month()
>>> month(dates).tolist()
[3, 6, 11]
"""
name = "month"
input_types = [Datetime]
return_type = Ordinal
dask_compatible = True
def get_function(self):
def month(vals):
if isinstance(vals, dd.Series):
return vals.dt.month
return pd.DatetimeIndex(vals).month.values
return month
[docs]class Year(TransformPrimitive):
"""Determines the year value of a datetime.
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 3, 1),
... datetime(2048, 6, 17, 11, 10, 50),
... datetime(1950, 11, 30, 19, 45, 15)]
>>> year = Year()
>>> year(dates).tolist()
[2019, 2048, 1950]
"""
name = "year"
input_types = [Datetime]
return_type = Ordinal
dask_compatible = True
def get_function(self):
def year(vals):
if isinstance(vals, dd.Series):
return vals.dt.year
return pd.DatetimeIndex(vals).year.values
return year
[docs]class IsWeekend(TransformPrimitive):
"""Determines if a date falls on a weekend.
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 3, 1),
... datetime(2019, 6, 17, 11, 10, 50),
... datetime(2019, 11, 30, 19, 45, 15)]
>>> is_weekend = IsWeekend()
>>> is_weekend(dates).tolist()
[False, False, True]
"""
name = "is_weekend"
input_types = [Datetime]
return_type = Boolean
dask_compatible = True
def get_function(self):
def is_weekend(vals):
if isinstance(vals, dd.Series):
return vals.dt.weekday > 4
return pd.DatetimeIndex(vals).weekday.values > 4
return is_weekend
[docs]class Weekday(TransformPrimitive):
"""Determines the day of the week from a datetime.
Description:
Returns the day of the week from a datetime value. Weeks
start on Monday (day 0) and run through Sunday (day 6).
Examples:
>>> from datetime import datetime
>>> dates = [datetime(2019, 3, 1),
... datetime(2019, 6, 17, 11, 10, 50),
... datetime(2019, 11, 30, 19, 45, 15)]
>>> weekday = Weekday()
>>> weekday(dates).tolist()
[4, 0, 5]
"""
name = "weekday"
input_types = [Datetime]
return_type = Ordinal
dask_compatible = True
def get_function(self):
def weekday(vals):
if isinstance(vals, dd.Series):
return vals.dt.weekday
return pd.DatetimeIndex(vals).weekday.values
return weekday
[docs]class NumCharacters(TransformPrimitive):
"""Calculates the number of characters in a string.
Examples:
>>> num_characters = NumCharacters()
>>> num_characters(['This is a string',
... 'second item',
... 'final1']).tolist()
[16, 11, 6]
"""
name = 'num_characters'
input_types = [Text]
return_type = Numeric
dask_compatible = True
def get_function(self):
def character_counter(array):
return array.fillna('').str.len()
return character_counter
[docs]class NumWords(TransformPrimitive):
"""Determines the number of words in a string by counting the spaces.
Examples:
>>> num_words = NumWords()
>>> num_words(['This is a string',
... 'Two words',
... 'no-spaces',
... 'Also works with sentences. Second sentence!']).tolist()
[4, 2, 1, 6]
"""
name = 'num_words'
input_types = [Text]
return_type = Numeric
dask_compatible = True
def get_function(self):
def word_counter(array):
return array.fillna('').str.count(' ') + 1
return word_counter
[docs]class TimeSince(TransformPrimitive):
"""Calculates time from a value to a specified cutoff datetime.
Args:
unit (str): Defines the unit of time to count from.
Defaults to Seconds. Acceptable values:
years, months, days, hours, minutes, seconds, milliseconds, nanoseconds
Examples:
>>> from datetime import datetime
>>> time_since = TimeSince()
>>> times = [datetime(2019, 3, 1, 0, 0, 0, 1),
... datetime(2019, 3, 1, 0, 0, 1, 0),
... datetime(2019, 3, 1, 0, 2, 0, 0)]
>>> cutoff_time = datetime(2019, 3, 1, 0, 0, 0, 0)
>>> values = time_since(array=times, time=cutoff_time)
>>> list(map(int, values))
[0, -1, -120]
Change output to nanoseconds
>>> from datetime import datetime
>>> time_since_nano = TimeSince(unit='nanoseconds')
>>> times = [datetime(2019, 3, 1, 0, 0, 0, 1),
... datetime(2019, 3, 1, 0, 0, 1, 0),
... datetime(2019, 3, 1, 0, 2, 0, 0)]
>>> cutoff_time = datetime(2019, 3, 1, 0, 0, 0, 0)
>>> values = time_since_nano(array=times, time=cutoff_time)
>>> list(map(lambda x: int(round(x)), values))
[-1000, -1000000000, -120000000000]
"""
name = 'time_since'
input_types = [[DatetimeTimeIndex], [Datetime]]
return_type = Numeric
uses_calc_time = True
dask_compatible = True
[docs] def __init__(self, unit="seconds"):
self.unit = unit.lower()
def get_function(self):
def pd_time_since(array, time):
if isinstance(array, list):
array = pd.Series(array)
return convert_time_units((time - array).dt.total_seconds(), self.unit)
return pd_time_since
[docs]class IsIn(TransformPrimitive):
"""Determines whether a value is present in a provided list.
Examples:
>>> items = ['string', 10.3, False]
>>> is_in = IsIn(list_of_outputs=items)
>>> is_in(['string', 10.5, False]).tolist()
[True, False, True]
"""
name = "isin"
input_types = [Variable]
return_type = Boolean
dask_compatible = True
[docs] def __init__(self, list_of_outputs=None):
self.list_of_outputs = list_of_outputs
def get_function(self):
def pd_is_in(array):
if isinstance(array, dd.Series):
return array.isin(self.list_of_outputs or [])
else:
return pd.Series(array).isin(self.list_of_outputs or [])
return pd_is_in
def generate_name(self, base_feature_names):
return u"%s.isin(%s)" % (base_feature_names[0],
str(self.list_of_outputs))
[docs]class Diff(TransformPrimitive):
"""Compute the difference between the value in a list and the
previous value in that list.
Description:
Given a list of values, compute the difference from the previous
item in the list. The result for the first element of the list will
always be `NaN`. If the values are datetimes, the output will be a
timedelta.
Examples:
>>> diff = Diff()
>>> values = [1, 10, 3, 4, 15]
>>> diff(values).tolist()
[nan, 9.0, -7.0, 1.0, 11.0]
"""
name = "diff"
input_types = [Numeric]
return_type = Numeric
uses_full_entity = True
def get_function(self):
def pd_diff(values):
return values.diff()
return pd_diff
class Negate(TransformPrimitive):
"""Negates a numeric value.
Examples:
>>> negate = Negate()
>>> negate([1.0, 23.2, -7.0]).tolist()
[-1.0, -23.2, 7.0]
"""
name = "negate"
input_types = [Numeric]
return_type = Numeric
dask_compatible = True
def get_function(self):
def negate(vals):
return vals * -1
return negate
def generate_name(self, base_feature_names):
return "-(%s)" % (base_feature_names[0])
[docs]class Not(TransformPrimitive):
"""Negates a boolean value.
Examples:
>>> not_func = Not()
>>> not_func([True, True, False]).tolist()
[False, False, True]
"""
name = "not"
input_types = [Boolean]
return_type = Boolean
dask_compatible = True
def generate_name(self, base_feature_names):
return u"NOT({})".format(base_feature_names[0])
def get_function(self):
return np.logical_not
[docs]class Percentile(TransformPrimitive):
"""Determines the percentile rank for each value in a list.
Examples:
>>> percentile = Percentile()
>>> percentile([10, 15, 1, 20]).tolist()
[0.5, 0.75, 0.25, 1.0]
Nan values are ignored when determining rank
>>> percentile([10, 15, 1, None, 20]).tolist()
[0.5, 0.75, 0.25, nan, 1.0]
"""
name = 'percentile'
uses_full_entity = True
input_types = [Numeric]
return_type = Numeric
def get_function(self):
return lambda array: pd.Series(array).rank(pct=True)
[docs]class Latitude(TransformPrimitive):
"""Returns the first tuple value in a list of LatLong tuples.
For use with the LatLong variable type.
Examples:
>>> latitude = Latitude()
>>> latitude([(42.4, -71.1),
... (40.0, -122.4),
... (41.2, -96.75)]).tolist()
[42.4, 40.0, 41.2]
"""
name = 'latitude'
input_types = [LatLong]
return_type = Numeric
def get_function(self):
def latitude(latlong):
if latlong.hasnans:
latlong = replace_latlong_nan(latlong)
return pd.Series(x[0] for x in latlong)
return latitude
[docs]class Longitude(TransformPrimitive):
"""Returns the second tuple value in a list of LatLong tuples.
For use with the LatLong variable type.
Examples:
>>> longitude = Longitude()
>>> longitude([(42.4, -71.1),
... (40.0, -122.4),
... (41.2, -96.75)]).tolist()
[-71.1, -122.4, -96.75]
"""
name = 'longitude'
input_types = [LatLong]
return_type = Numeric
def get_function(self):
def longitude(latlong):
if latlong.hasnans:
latlong = replace_latlong_nan(latlong)
return pd.Series(x[1] for x in latlong)
return longitude
[docs]class Haversine(TransformPrimitive):
"""Calculates the approximate haversine distance between two LatLong
variable types.
Args:
unit (str): Determines the unit value to output. Could
be `miles` or `kilometers`. Default is `miles`.
Examples:
>>> haversine = Haversine()
>>> distances = haversine([(42.4, -71.1), (40.0, -122.4)],
... [(40.0, -122.4), (41.2, -96.75)])
>>> np.round(distances, 3).tolist()
[2631.231, 1343.289]
Output units can be specified
>>> haversine_km = Haversine(unit='kilometers')
>>> distances_km = haversine_km([(42.4, -71.1), (40.0, -122.4)],
... [(40.0, -122.4), (41.2, -96.75)])
>>> np.round(distances_km, 3).tolist()
[4234.555, 2161.814]
"""
name = 'haversine'
input_types = [LatLong, LatLong]
return_type = Numeric
commutative = True
[docs] def __init__(self, unit='miles'):
valid_units = ['miles', 'kilometers']
if unit not in valid_units:
error_message = 'Invalid unit %s provided. Must be one of %s' % (unit, valid_units)
raise ValueError(error_message)
self.unit = unit
def get_function(self):
def haversine(latlong1, latlong2):
if latlong1.hasnans:
latlong1 = replace_latlong_nan(latlong1)
if latlong2.hasnans:
latlong2 = replace_latlong_nan(latlong2)
lat_1s = np.array([x[0] for x in latlong1])
lon_1s = np.array([x[1] for x in latlong1])
lat_2s = np.array([x[0] for x in latlong2])
lon_2s = np.array([x[1] for x in latlong2])
lon1, lat1, lon2, lat2 = map(
np.radians, [lon_1s, lat_1s, lon_2s, lat_2s])
dlon = lon2 - lon1
dlat = lat2 - lat1
a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * \
np.cos(lat2) * np.sin(dlon / 2.0)**2
radius_earth = 3958.7613
if self.unit == 'kilometers':
radius_earth = 6371.0088
distance = radius_earth * 2 * np.arcsin(np.sqrt(a))
return distance
return haversine
def generate_name(self, base_feature_names):
name = u"{}(".format(self.name.upper())
name += u", ".join(base_feature_names)
if self.unit != 'miles':
name += u", unit={}".format(self.unit)
name += u")"
return name
class Age(TransformPrimitive):
"""Calculates the age in years as a floating point number given a
date of birth.
Description:
Age in years is computed by calculating the number of days between
the date of birth and the reference time and dividing the result
by 365.
Examples:
Determine the age of three people as of Jan 1, 2019
>>> import pandas as pd
>>> reference_date = pd.to_datetime("01-01-2019")
>>> age = Age()
>>> input_ages = [pd.to_datetime("01-01-2000"),
... pd.to_datetime("05-30-1983"),
... pd.to_datetime("10-17-1997")]
>>> age(input_ages, time=reference_date).tolist()
[19.013698630136986, 35.61643835616438, 21.221917808219178]
"""
name = "age"
input_types = [DateOfBirth]
return_type = Numeric
uses_calc_time = True
def get_function(self):
def age(x, time=None):
return (time - x).dt.days / 365
return age