import numpy as np
import pandas as pd
from featuretools.utils.gen_utils import camel_to_snake, find_descendents
class ClassNameDescriptor(object):
    """Class-level attribute yielding the owner class's name in snake case.

    The value is computed on every access by passing the owner class's
    ``__name__`` through ``camel_to_snake``.
    """

    def __get__(self, instance, class_):
        # Only the owner class is consulted; ``instance`` is ignored, so the
        # attribute reads the same from a class and from its instances.
        owner_name = class_.__name__
        return camel_to_snake(owner_name)
class Variable(object):
    """Represent a variable in an entity

    A Variable is analogous to a column in a table in a relational database.

    Args:
        id (str) : Id of variable. Must match underlying data in the Entity
            it belongs to.
        entity (:class:`.Entity`) : Entity this variable belongs to.
        name (str, optional) : Variable name. Defaults to id.

    See Also:
        :class:`.Entity`, :class:`.Relationship`, :class:`.BaseEntitySet`
    """
    # Snake-case form of the concrete class name, resolved per subclass.
    type_string = ClassNameDescriptor()
    # dtype used for ``interesting_values`` when the entity's data has no
    # entry for this variable's id.
    _default_pandas_dtype = object

    def __init__(self, id, entity, name=None):
        assert isinstance(id, str), "Variable id must be a string"
        self.id = id
        self._name = name
        self.entity_id = entity.id
        assert entity.entityset is not None, "Entity must contain reference to EntitySet"
        self.entity = entity

        if self.id in self.entity.df:
            # Data already present: mirror its dtype.
            values_dtype = self.entity.df[self.id].dtype
        else:
            values_dtype = self._default_pandas_dtype
            # The bare numpy datetime/timedelta types are swapped for their
            # nanosecond-resolution dtype strings before building the Series.
            if values_dtype == np.datetime64:
                values_dtype = 'datetime64[ns]'
            elif values_dtype == np.timedelta64:
                values_dtype = 'timedelta64[ns]'

        # Start with an empty, correctly typed container of interesting values.
        self._interesting_values = pd.Series(dtype=values_dtype)

    @property
    def entityset(self):
        """EntitySet that the owning entity refers to."""
        return self.entity.entityset

    def __eq__(self, other, deep=False):
        """Compare by type, id and entity id.

        With ``deep=True`` the sets of interesting values must match as well.
        """
        if not isinstance(other, self.__class__):
            return False
        same_identity = (self.id == other.id and
                         self.entity_id == other.entity_id)
        if deep and same_identity:
            own_values = set(self.interesting_values.values)
            other_values = set(other.interesting_values.values)
            return own_values == other_values
        return same_identity

    def __hash__(self):
        # Built from the same fields the shallow equality check compares.
        return hash((self.id, self.entity_id))

    def __repr__(self):
        template = u"<Variable: {} (dtype = {})>"
        return template.format(self.name, self.type_string)

    @classmethod
    def create_from(cls, variable):
        """Create a new variable of this type from an existing one

        Args:
            variable (Variable) : Existing variable to create from.

        Returns:
            :class:`.Variable` : new variable
        """
        return cls(id=variable.id, name=variable.name, entity=variable.entity)

    @property
    def name(self):
        """Display name; the id is used when no name was given."""
        if self._name is None:
            return self.id
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    @property
    def dtype(self):
        """Type string of this variable, or "generic_type" if it is None."""
        if self.type_string is None:
            return "generic_type"
        return self.type_string

    @property
    def interesting_values(self):
        """Series of interesting values recorded for this variable."""
        return self._interesting_values

    @interesting_values.setter
    def interesting_values(self, interesting_values):
        # Reuse the dtype of the Series being replaced.
        current_dtype = self._interesting_values.dtype
        self._interesting_values = pd.Series(interesting_values,
                                             dtype=current_dtype)

    @property
    def series(self):
        """This variable's data, looked up by id in the entity's ``df``."""
        return self.entity.df[self.id]

    def to_data_description(self):
        """Return a dict describing this variable.

        The dict has the keys ``id``, ``type`` (holding the type string under
        ``value``) and ``properties`` (name, entity id, and the interesting
        values encoded with ``Series.to_json``).
        """
        properties = {
            'name': self.name,
            'entity': self.entity.id,
            'interesting_values': self._interesting_values.to_json(),
        }
        return {
            'id': self.id,
            'type': {'value': self.type_string},
            'properties': properties,
        }
class Unknown(Variable):
    """Variable type that adds nothing to the base :class:`Variable`."""
class Discrete(Variable):
    """Superclass representing variables that take on discrete values"""

    def __init__(self, id, entity, name=None):
        super(Discrete, self).__init__(id, entity, name)

    @property
    def interesting_values(self):
        """Series of interesting values recorded for this variable."""
        return self._interesting_values

    @interesting_values.setter
    def interesting_values(self, values):
        # Drop repeated values, keeping the first occurrence of each in its
        # original position, and keep the dtype of the Series being replaced.
        already_seen = set()
        unique_values = []
        for value in values:
            if value in already_seen:
                continue
            already_seen.add(value)
            unique_values.append(value)
        self._interesting_values = pd.Series(
            unique_values, dtype=self._interesting_values.dtype)
class Boolean(Variable):
    """Represents variables that take on one of two values

    Args:
        true_values (list) : List of values treated as true.
            Defaults to [1, True, "true", "True", "yes", "t", "T"]
        false_values (list): List of values treated as false.
            Defaults to [0, False, "false", "False", "no", "f", "F"]
    """
    _default_pandas_dtype = bool

    def __init__(self,
                 id,
                 entity,
                 name=None,
                 true_values=None,
                 false_values=None):
        # Any falsy argument (None or an empty list) selects the defaults.
        if not true_values:
            true_values = [1, True, "true", "True", "yes", "t", "T"]
        self.true_values = true_values

        if not false_values:
            false_values = [0, False, "false", "False", "no", "f", "F"]
        self.false_values = false_values

        super(Boolean, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description's ``type`` with both value lists."""
        description = super(Boolean, self).to_data_description()
        type_info = description['type']
        type_info['true_values'] = self.true_values
        type_info['false_values'] = self.false_values
        return description
class Categorical(Discrete):
    """Represents variables that can take an unordered discrete values

    Args:
        categories (list) : List of categories. If left blank, an empty list
            is stored.
    """

    def __init__(self, id, entity, name=None, categories=None):
        # Store the caller's categories; fall back to an empty list when the
        # argument is omitted (or empty). Previously the argument was ignored
        # and the attribute was always [].
        self.categories = categories or []
        super(Categorical, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description's ``type`` with ``categories``."""
        description = super(Categorical, self).to_data_description()
        description['type'].update({'categories': self.categories})
        return description
class Id(Categorical):
    """Represents variables that identify another entity"""
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = int
class Ordinal(Discrete):
    """Represents variables that take on an ordered discrete value"""
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = int
class Numeric(Variable):
    """Represents variables that contain numeric values

    Args:
        range (list, optional) : List of start and end. Can use inf and -inf
            to represent infinity. Unconstrained if not specified.
        start_inclusive (bool, optional) : Whether or not range includes the
            start value. Defaults to True.
        end_inclusive (bool, optional) : Whether or not range includes the
            end value. Defaults to False.

    Attributes:
        max (float)
        min (float)
        std (float)
        mean (float)

    NOTE(review): the attributes listed above are carried over from the
    original documentation; nothing in this class sets them - confirm.
    """
    _default_pandas_dtype = float

    def __init__(self,
                 id,
                 entity,
                 name=None,
                 range=None,
                 start_inclusive=True,
                 end_inclusive=False):
        # Store the caller's range; fall back to an empty list when the
        # argument is omitted (or empty). Previously the argument was ignored
        # and the attribute was always [].
        self.range = range or []
        self.start_inclusive = start_inclusive
        self.end_inclusive = end_inclusive
        super(Numeric, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description's ``type`` with the range settings."""
        description = super(Numeric, self).to_data_description()
        description['type'].update({
            'range': self.range,
            'start_inclusive': self.start_inclusive,
            'end_inclusive': self.end_inclusive,
        })
        return description
class Index(Variable):
    """Represents variables that uniquely identify an instance of an entity

    Attributes:
        count (int)

    NOTE(review): ``count`` is carried over from the original documentation;
    nothing in this class sets it - confirm.
    """
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = int
class Datetime(Variable):
    """Represents variables that are points in time

    Args:
        format (str): Python datetime format string documented
            `here <http://strftime.org/>`_.
    """
    _default_pandas_dtype = np.datetime64

    def __init__(self, id, entity, name=None, format=None):
        self.format = format
        super(Datetime, self).__init__(id, entity, name=name)

    def __repr__(self):
        template = u"<Variable: {} (dtype: {}, format: {})>"
        return template.format(self.name, self.type_string, self.format)

    def to_data_description(self):
        """Extend the base description's ``type`` with ``format``."""
        description = super(Datetime, self).to_data_description()
        description['type']['format'] = self.format
        return description
class TimeIndex(Variable):
    """Represents time index of entity"""
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = np.datetime64
class NumericTimeIndex(TimeIndex, Numeric):
    """Represents time index of entity that is numeric"""
    # Overrides the datetime default inherited from TimeIndex.
    _default_pandas_dtype = float
class DatetimeTimeIndex(TimeIndex, Datetime):
    """Represents time index of entity that is a datetime"""
    # Same default as both parent classes, stated explicitly.
    _default_pandas_dtype = np.datetime64
class Timedelta(Variable):
    """Represents variables that are timedeltas

    Args:
        range (list, optional) : List of start and end of allowed range in
            seconds. Can use inf and -inf to represent infinity.
            Unconstrained if not specified.
        start_inclusive (bool, optional) : Whether or not range includes the
            start value.
        end_inclusive (bool, optional) : Whether or not range includes the
            end value
    """
    _default_pandas_dtype = np.timedelta64

    def __init__(self,
                 id,
                 entity,
                 name=None,
                 range=None,
                 start_inclusive=True,
                 end_inclusive=False):
        # A falsy range (None or an empty list) is stored as a new empty list.
        if not range:
            range = []
        self.range = range
        self.start_inclusive = start_inclusive
        self.end_inclusive = end_inclusive
        super(Timedelta, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description's ``type`` with the range settings."""
        description = super(Timedelta, self).to_data_description()
        type_info = description['type']
        type_info['range'] = self.range
        type_info['start_inclusive'] = self.start_inclusive
        type_info['end_inclusive'] = self.end_inclusive
        return description
class Text(Variable):
    """Represents variables that are arbitrary strings"""
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class PandasTypes(object):
    """Namespace of dtype-name constants, grouped by kind."""
    _all = 'all'
    _categorical = 'category'
    _pandas_datetimes = [
        'datetime64[ns]',
        'datetime64[ns, tz]',
    ]
    _pandas_timedeltas = ['Timedelta']
    # Integer names first, then floating-point names.
    _pandas_numerics = [
        'int16',
        'int32',
        'int64',
        'float16',
        'float32',
        'float64',
    ]
class LatLong(Variable):
    """Represents an ordered pair (Latitude, Longitude)

    To make a latlong in a dataframe do::

        data['latlong'] = data[['latitude', 'longitude']].apply(tuple, axis=1)
    """
class ZIPCode(Categorical):
    """Represents a postal address in the United States.

    Consists of a series of digits which are cast as a string.
    Five-digit and nine-digit zipcodes are supported.
    """
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class IPAddress(Variable):
    """Represents a computer network address.

    Represented in dotted-decimal notation. IPv4 and IPv6 are supported.
    """
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class FullName(Variable):
    """Represents a person's full name.

    May consist of a first name, last name, and a title.
    """
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class EmailAddress(Variable):
    """Represents an email box to which email messages are sent.

    Consists of a local-part, an @ symbol, and a domain.
    """
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class URL(Variable):
    """Represents a valid web url (with or without http/www)"""
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class PhoneNumber(Variable):
    """Represents any valid phone number.

    Can be with/without parentheses.
    Can be with/without area/country codes.
    """
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class DateOfBirth(Datetime):
    """Represents a date of birth as a datetime"""
    # Same default as the parent Datetime class, stated explicitly.
    _default_pandas_dtype = np.datetime64
class CountryCode(Categorical):
    """Represents an ISO-3166 standard country code.

    ISO 3166-1 (countries) are supported. These codes
    should be in the Alpha-2 format.
    e.g. United States of America = US
    """
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class SubRegionCode(Categorical):
    """Represents an ISO-3166 standard sub-region code.

    ISO 3166-2 codes (sub-regions) are supported. These codes
    should be in the Alpha-2 format.
    e.g. United States of America, Arizona = US-AZ
    """
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
class FilePath(Variable):
    """Represents a valid filepath, absolute or relative"""
    # Fallback dtype when the entity's data has no entry for the variable.
    _default_pandas_dtype = str
def find_variable_types():
    """Map each variable class's ``type_string`` to the class itself.

    Iterates over ``find_descendents(Variable)`` and leaves out ``Variable``
    itself.

    Returns:
        dict : type string -> variable class
    """
    types_by_name = {}
    for variable_class in find_descendents(Variable):
        if variable_class != Variable:
            types_by_name[variable_class.type_string] = variable_class
    return types_by_name
# One sample value per dtype key. The keys are the same objects that the
# variable classes in this module assign to ``_default_pandas_dtype``.
# NOTE(review): presumably used to build placeholder data for a variable of
# each dtype - confirm against callers.
DEFAULT_DTYPE_VALUES = {
# Evaluated once, when this module is imported.
np.datetime64: pd.Timestamp.now(),
int: 0,
float: 0.1,
np.timedelta64: pd.Timedelta('1d'),
object: 'object',
bool: True,
str: 'test'
}