Source code for featuretools.entityset.entityset

import copy
import logging
from collections import defaultdict

import dask.dataframe as dd
import numpy as np
import pandas as pd
from pandas.api.types import is_dtype_equal, is_numeric_dtype

import featuretools.variable_types.variable as vtypes
from featuretools.entityset import deserialize, serialize
from featuretools.entityset.entity import Entity
from featuretools.entityset.relationship import Relationship, RelationshipPath
from featuretools.utils.gen_utils import import_or_raise

pd.options.mode.chained_assignment = None  # default='warn'
logger = logging.getLogger('featuretools.entityset')


class EntitySet(object):
    """
    Stores all actual data for an entityset

    Attributes:
        id
        entity_dict
        relationships
        time_type

    Properties:
        metadata
    """
    def __init__(self, id=None, entities=None, relationships=None):
        """Creates EntitySet

        Args:
            id (str) : Unique identifier to associate with this instance

            entities (dict[str -> tuple(pd.DataFrame, str, str, dict[str -> Variable])]):
                dictionary of entities. Entries take the format
                {entity id -> (dataframe, id column, (time_index), (variable_types), (make_index))}.
                Note that time_index, variable_types and make_index are optional.

            relationships (list[(str, str, str, str)]): List of relationships
                between entities. List items are a tuple with the format
                (parent entity id, parent variable, child entity id, child variable).

        Example:

            .. code-block:: python

                entities = {
                    "cards" : (card_df, "id"),
                    "transactions" : (transactions_df, "id", "transaction_time")
                }

                relationships = [("cards", "id", "transactions", "card_id")]

                ft.EntitySet("my-entity-set", entities, relationships)
        """
        self.id = id
        self.entity_dict = {}
        self.relationships = []
        self.time_type = None

        entities = entities or {}
        relationships = relationships or []
        for entity in entities:
            df = entities[entity][0]
            index_column = entities[entity][1]
            time_index = None
            variable_types = None
            make_index = None
            if len(entities[entity]) > 2:
                time_index = entities[entity][2]
            if len(entities[entity]) > 3:
                variable_types = entities[entity][3]
            if len(entities[entity]) > 4:
                make_index = entities[entity][4]
            self.entity_from_dataframe(entity_id=entity,
                                       dataframe=df,
                                       index=index_column,
                                       time_index=time_index,
                                       variable_types=variable_types,
                                       make_index=make_index)

        for relationship in relationships:
            parent_variable = self[relationship[0]][relationship[1]]
            child_variable = self[relationship[2]][relationship[3]]
            self.add_relationship(Relationship(parent_variable,
                                               child_variable))
        self.reset_data_description()
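    # A minimal construction sketch using the incremental API instead of the
    # entities/relationships dictionaries shown in the docstring above. The
    # dataframes and column names ("sessions_df", "transactions_df",
    # "session_id") are hypothetical placeholders, not part of this module:
    #
    #   es = EntitySet(id="example")
    #   es.entity_from_dataframe("sessions", sessions_df, index="id")
    #   es.entity_from_dataframe("transactions", transactions_df, index="id",
    #                            time_index="transaction_time")
    #   es.add_relationship(Relationship(es["sessions"]["id"],
    #                                    es["transactions"]["session_id"]))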
    def __sizeof__(self):
        return sum([entity.__sizeof__() for entity in self.entities])

    def __dask_tokenize__(self):
        return (EntitySet, serialize.entityset_to_description(self.metadata))

    def __eq__(self, other, deep=False):
        if len(self.entity_dict) != len(other.entity_dict):
            return False
        for eid, e in self.entity_dict.items():
            if eid not in other.entity_dict:
                return False
            if not e.__eq__(other[eid], deep=deep):
                return False
        # Check this entityset's relationships against the other's
        for r in self.relationships:
            if r not in other.relationships:
                return False
        return True

    def __ne__(self, other, deep=False):
        return not self.__eq__(other, deep=deep)
    def __getitem__(self, entity_id):
        """Get entity instance from entityset

        Args:
            entity_id (str): Id of entity.

        Returns:
            :class:`.Entity` : Instance of entity.

        Raises:
            KeyError : If the entity does not exist in the entityset.
        """
        if entity_id in self.entity_dict:
            return self.entity_dict[entity_id]
        name = self.id or "entity set"
        raise KeyError('Entity %s does not exist in %s' % (entity_id, name))
    @property
    def entities(self):
        return list(self.entity_dict.values())

    @property
    def metadata(self):
        '''Returns the metadata for this EntitySet. The metadata will be recomputed if it does not exist.'''
        if self._data_description is None:
            description = serialize.entityset_to_description(self)
            self._data_description = deserialize.description_to_entityset(description)

        return self._data_description

    def reset_data_description(self):
        self._data_description = None
    def to_pickle(self, path, compression=None, profile_name=None):
        '''Write entityset in the pickle format, location specified by `path`.
            Path could be a local path or an S3 path.
            If writing to S3, a tar archive of files will be written.

            Args:
                path (str): location on disk to write to (will be created as a directory)
                compression (str) : Name of the compression to use. Possible values are: {'gzip', 'bz2', 'zip', 'xz', None}.
                profile_name (str) : Name of AWS profile to use, False to use an anonymous profile, or None.
        '''
        serialize.write_data_description(self, path, format='pickle', compression=compression, profile_name=profile_name)
        return self

    def to_parquet(self, path, engine='auto', compression=None, profile_name=None):
        '''Write entityset to disk in the parquet format, location specified by `path`.
            Path could be a local path or an S3 path.
            If writing to S3, a tar archive of files will be written.

            Args:
                path (str): location on disk to write to (will be created as a directory)
                engine (str) : Name of the engine to use. Possible values are: {'auto', 'pyarrow', 'fastparquet'}.
                compression (str) : Name of the compression to use. Possible values are: {'snappy', 'gzip', 'brotli', None}.
                profile_name (str) : Name of AWS profile to use, False to use an anonymous profile, or None.
        '''
        serialize.write_data_description(self, path, format='parquet', engine=engine, compression=compression, profile_name=profile_name)
        return self

    def to_csv(self, path, sep=',', encoding='utf-8', engine='python', compression=None, profile_name=None):
        '''Write entityset to disk in the csv format, location specified by `path`.
            Path could be a local path or an S3 path.
            If writing to S3, a tar archive of files will be written.

            Args:
                path (str) : Location on disk to write to (will be created as a directory)
                sep (str) : String of length 1. Field delimiter for the output file.
                encoding (str) : A string representing the encoding to use in the output file, defaults to 'utf-8'.
                engine (str) : Name of the engine to use. Possible values are: {'c', 'python'}.
                compression (str) : Name of the compression to use. Possible values are: {'gzip', 'bz2', 'zip', 'xz', None}.
                profile_name (str) : Name of AWS profile to use, False to use an anonymous profile, or None.
        '''
        serialize.write_data_description(self, path, format='csv', index=False, sep=sep, encoding=encoding, engine=engine, compression=compression, profile_name=profile_name)
        return self
    def to_dictionary(self):
        return serialize.entityset_to_description(self)

    ###########################################################################
    # Public getter/setter methods  ###########################################
    ###########################################################################

    def __repr__(self):
        repr_out = u"Entityset: {}\n".format(self.id)
        repr_out += u"  Entities:"
        for e in self.entities:
            if e.df.shape:
                repr_out += u"\n    {} [Rows: {}, Columns: {}]".format(
                    e.id, e.df.shape[0], e.df.shape[1])
            else:
                repr_out += u"\n    {} [Rows: None, Columns: None]".format(e.id)
        repr_out += "\n  Relationships:"

        if len(self.relationships) == 0:
            repr_out += u"\n    No relationships"

        for r in self.relationships:
            repr_out += u"\n    %s.%s -> %s.%s" % \
                (r._child_entity_id, r._child_variable_id,
                 r._parent_entity_id, r._parent_variable_id)

        return repr_out

    def add_relationships(self, relationships):
        """Add multiple new relationships to an entityset

        Args:
            relationships (list[Relationship]) : List of new
                relationships.
        """
        return [self.add_relationship(r) for r in relationships][-1]
    def add_relationship(self, relationship):
        """Add a new relationship between entities in the entityset

        Args:
            relationship (Relationship) : Instance of new
                relationship to be added.
        """
        if relationship in self.relationships:
            logger.warning(
                "Not adding duplicate relationship: %s", relationship)
            return self

        # _operations?

        # this is a new pair of entities
        child_e = relationship.child_entity
        child_v = relationship.child_variable.id
        parent_e = relationship.parent_entity
        parent_v = relationship.parent_variable.id
        if not isinstance(child_e[child_v], vtypes.Id):
            child_e.convert_variable_type(variable_id=child_v,
                                          new_type=vtypes.Id,
                                          convert_data=False)

        if not isinstance(parent_e[parent_v], vtypes.Index):
            parent_e.convert_variable_type(variable_id=parent_v,
                                           new_type=vtypes.Index,
                                           convert_data=False)

        # Empty dataframes (as a result of accessing Entity.metadata)
        # default to object dtypes for discrete variables, but
        # indexes/ids default to ints. In this case, we convert
        # the empty column's type to int
        if isinstance(child_e.df, pd.DataFrame) and \
                (child_e.df.empty and child_e.df[child_v].dtype == object and
                 is_numeric_dtype(parent_e.df[parent_v])):
            child_e.df[child_v] = pd.Series(name=child_v, dtype=np.int64)

        parent_dtype = parent_e.df[parent_v].dtype
        child_dtype = child_e.df[child_v].dtype
        msg = u"Unable to add relationship because {} in {} is Pandas dtype {}"\
            u" and {} in {} is Pandas dtype {}."
        if not is_dtype_equal(parent_dtype, child_dtype):
            raise ValueError(msg.format(parent_v, parent_e.id, parent_dtype,
                                        child_v, child_e.id, child_dtype))

        self.relationships.append(relationship)
        self.reset_data_description()
        return self
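    # Usage sketch: linking a parent entity's index variable to a child
    # entity's foreign-key variable. The entity and variable names here
    # ("customers", "orders", "customer_id") are illustrative assumptions:
    #
    #   rel = Relationship(es["customers"]["customer_id"],
    #                      es["orders"]["customer_id"])
    #   es = es.add_relationship(rel)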
    ###########################################################################
    # Relationship access/helper methods  #####################################
    ###########################################################################
    def find_forward_paths(self, start_entity_id, goal_entity_id):
        """
        Generator which yields all forward paths between a start and goal
        entity. Does not include paths which contain cycles.

        Args:
            start_entity_id (str) : id of entity to start the search from
            goal_entity_id (str) : id of entity to find forward path to

        See Also:
            :func:`BaseEntitySet.find_backward_paths`
        """
        for sub_entity_id, path in self._forward_entity_paths(start_entity_id):
            if sub_entity_id == goal_entity_id:
                yield path
    def find_backward_paths(self, start_entity_id, goal_entity_id):
        """
        Generator which yields all backward paths between a start and goal
        entity. Does not include paths which contain cycles.

        Args:
            start_entity_id (str) : Id of entity to start the search from.
            goal_entity_id (str) : Id of entity to find backward path to.

        See Also:
            :func:`BaseEntitySet.find_forward_paths`
        """
        for path in self.find_forward_paths(goal_entity_id, start_entity_id):
            # Reverse path
            yield path[::-1]
    def _forward_entity_paths(self, start_entity_id, seen_entities=None):
        """
        Generator which yields the ids of all entities connected through forward
        relationships, and the path taken to each. An entity will be yielded
        multiple times if there are multiple paths to it.

        Implemented using depth first search.
        """
        if seen_entities is None:
            seen_entities = set()

        if start_entity_id in seen_entities:
            return

        seen_entities.add(start_entity_id)

        yield start_entity_id, []

        for relationship in self.get_forward_relationships(start_entity_id):
            next_entity = relationship.parent_entity.id
            # Copy seen entities for each next node to allow multiple paths (but
            # not cycles).
            descendants = self._forward_entity_paths(next_entity, seen_entities.copy())
            for sub_entity_id, sub_path in descendants:
                yield sub_entity_id, [relationship] + sub_path
    def get_forward_entities(self, entity_id, deep=False):
        """
        Get entities that are in a forward relationship with entity

        Args:
            entity_id (str): Id of entity to search from.
            deep (bool): if True, recursively find forward entities.

        Yields a tuple of (descendent_id, path from entity_id to descendant).
        """
        for relationship in self.get_forward_relationships(entity_id):
            parent_eid = relationship.parent_entity.id
            direct_path = RelationshipPath([(True, relationship)])
            yield parent_eid, direct_path

            if deep:
                sub_entities = self.get_forward_entities(parent_eid, deep=True)
                for sub_eid, path in sub_entities:
                    yield sub_eid, direct_path + path
    def get_backward_entities(self, entity_id, deep=False):
        """
        Get entities that are in a backward relationship with entity

        Args:
            entity_id (str): Id of entity to search from.
            deep (bool): if True, recursively find backward entities.

        Yields a tuple of (descendent_id, path from entity_id to descendant).
        """
        for relationship in self.get_backward_relationships(entity_id):
            child_eid = relationship.child_entity.id
            direct_path = RelationshipPath([(False, relationship)])
            yield child_eid, direct_path

            if deep:
                sub_entities = self.get_backward_entities(child_eid, deep=True)
                for sub_eid, path in sub_entities:
                    yield sub_eid, direct_path + path
    def get_forward_relationships(self, entity_id):
        """Get relationships where entity "entity_id" is the child

        Args:
            entity_id (str): Id of entity to get relationships for.

        Returns:
            list[:class:`.Relationship`]: List of forward relationships.
        """
        return [r for r in self.relationships if r.child_entity.id == entity_id]

    def get_backward_relationships(self, entity_id):
        """
        Get relationships where entity "entity_id" is the parent.

        Args:
            entity_id (str): Id of entity to get relationships for.

        Returns:
            list[:class:`.Relationship`]: list of backward relationships
        """
        return [r for r in self.relationships if r.parent_entity.id == entity_id]

    def has_unique_forward_path(self, start_entity_id, end_entity_id):
        """
        Is the forward path from start to end unique?

        This will raise if there is no such path.
        """
        paths = self.find_forward_paths(start_entity_id, end_entity_id)

        next(paths)
        second_path = next(paths, None)

        return not second_path
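    # Traversal sketch, assuming an entityset `es` whose child entity
    # "transactions" is linked to parent entities such as "customers"
    # (all names are illustrative, not part of this module):
    #
    #   for parent_id, path in es.get_forward_entities("transactions", deep=True):
    #       print(parent_id, path)
    #   unique = es.has_unique_forward_path("transactions", "customers")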
    ###########################################################################
    # Entity creation methods  ################################################
    ###########################################################################

    def entity_from_dataframe(self,
                              entity_id,
                              dataframe,
                              index=None,
                              variable_types=None,
                              make_index=False,
                              time_index=None,
                              secondary_time_index=None,
                              already_sorted=False):
        """
        Load the data for a specified entity from a Pandas DataFrame.

        Args:
            entity_id (str) : Unique id to associate with this entity.

            dataframe (pandas.DataFrame) : Dataframe containing the data.

            index (str, optional): Name of the variable used to index the entity.
                If None, take the first column.

            variable_types (dict[str -> Variable/str], optional):
                Keys are variable ids and values are variable types or type_strings.
                Used to initialize an entity's store.

            make_index (bool, optional) : If True, assume index does not exist as a column in
                dataframe, and create a new column of that name using integers. Otherwise,
                assume index exists.

            time_index (str, optional): Name of the variable containing
                time data. Type must be in :class:`variables.DateTime` or be
                able to be cast to datetime (e.g. str, float, or numeric.)

            secondary_time_index (dict[str -> Variable]): Name of variable
                containing time data to use as a secondary time index for the entity.

            already_sorted (bool, optional) : If True, assumes that input dataframe
                is already sorted by time. Defaults to False.

        Notes:

            Will infer variable types from Pandas dtype

        Example:
            .. ipython:: python

                import featuretools as ft
                import pandas as pd
                transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                                                "session_id": [1, 2, 1, 3, 4, 5],
                                                "amount": [100.40, 20.63, 33.32, 13.12, 67.22, 1.00],
                                                "transaction_time": pd.date_range(start="10:00", periods=6, freq="10s"),
                                                "fraud": [True, False, True, False, True, True]})
                es = ft.EntitySet("example")
                es.entity_from_dataframe(entity_id="transactions",
                                         index="id",
                                         time_index="transaction_time",
                                         dataframe=transactions_df)

                es["transactions"]
                es["transactions"].df
        """
        variable_types = variable_types or {}

        if time_index is not None and time_index == index:
            raise ValueError("time_index and index cannot be the same value, %s" % (time_index))

        if time_index is None:
            for variable, variable_type in variable_types.items():
                if variable_type == vtypes.DatetimeTimeIndex:
                    raise ValueError("DatetimeTimeIndex variable %s must be set using time_index parameter" % (variable))

        if len(self.entities) > 0:
            if not isinstance(dataframe, type(self.entities[0].df)):
                raise ValueError("All entity dataframes must be of the same type. "
                                 "Cannot add entity of type {} to an entityset with existing entities "
                                 "of type {}".format(type(dataframe), type(self.entities[0].df)))

        entity = Entity(
            entity_id,
            dataframe,
            self,
            variable_types=variable_types,
            index=index,
            time_index=time_index,
            secondary_time_index=secondary_time_index,
            already_sorted=already_sorted,
            make_index=make_index)
        self.entity_dict[entity.id] = entity
        self.reset_data_description()
        return self
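    # Sketch of make_index: when a dataframe has no natural id column, passing
    # make_index=True creates a new integer index column with the given name.
    # "log_id" and `log_df` are illustrative assumptions:
    #
    #   es.entity_from_dataframe(entity_id="log",
    #                            dataframe=log_df,
    #                            index="log_id",
    #                            make_index=True)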
    def normalize_entity(self, base_entity_id, new_entity_id, index,
                         additional_variables=None, copy_variables=None,
                         make_time_index=None,
                         make_secondary_time_index=None,
                         new_entity_time_index=None,
                         new_entity_secondary_time_index=None):
        """Create a new entity and relationship from unique values of an existing variable.

        Args:
            base_entity_id (str) : Entity id from which to split.

            new_entity_id (str): Id of the new entity.

            index (str): Variable in old entity
                that will become index of new entity. Relationship
                will be created across this variable.

            additional_variables (list[str]):
                List of variable ids to remove from
                base_entity and move to new entity.

            copy_variables (list[str]): List of
                variable ids to copy from old entity
                and move to new entity.

            make_time_index (bool or str, optional): Create time index for new entity based
                on time index in base_entity, optionally specifying which variable in base_entity
                to use for time_index. If specified as True without a specific variable,
                uses the primary time index. Defaults to True if base entity has a time index.

            make_secondary_time_index (dict[str -> list[str]], optional): Create a secondary time index
                from key. Values of dictionary
                are the variables to associate with the secondary time index. Only one
                secondary time index is allowed. If None, only associate the time index.

            new_entity_time_index (str, optional): Rename new entity time index.

            new_entity_secondary_time_index (str, optional): Rename new entity secondary time index.

        """
        base_entity = self.entity_dict[base_entity_id]
        additional_variables = additional_variables or []
        copy_variables = copy_variables or []

        # Check base entity to make sure time index is valid
        if base_entity.time_index is not None:
            t_index = base_entity[base_entity.time_index]
            if not isinstance(t_index, (vtypes.NumericTimeIndex, vtypes.DatetimeTimeIndex)):
                base_error = "Time index '{0}' is not a NumericTimeIndex or DatetimeTimeIndex, but type {1}. Use set_time_index on entity '{2}' to set the time_index."
                raise TypeError(base_error.format(base_entity.time_index, type(t_index), str(base_entity.id)))

        if not isinstance(additional_variables, list):
            raise TypeError("'additional_variables' must be a list, but received type {}"
                            .format(type(additional_variables)))

        if len(additional_variables) != len(set(additional_variables)):
            raise ValueError("'additional_variables' contains duplicate variables. All variables must be unique.")

        if not isinstance(copy_variables, list):
            raise TypeError("'copy_variables' must be a list, but received type {}"
                            .format(type(copy_variables)))

        if len(copy_variables) != len(set(copy_variables)):
            raise ValueError("'copy_variables' contains duplicate variables. All variables must be unique.")

        for v in additional_variables + copy_variables:
            if v == index:
                raise ValueError("Not copying {} as both index and variable".format(v))

        for v in additional_variables:
            if v == base_entity.time_index:
                raise ValueError("Not moving {} as it is the base time index variable. Perhaps, move the variable to the copy_variables.".format(v))

        if isinstance(make_time_index, str):
            if make_time_index not in base_entity.df.columns:
                raise ValueError("'make_time_index' must be a variable in the base entity")
            elif make_time_index not in additional_variables + copy_variables:
                raise ValueError("'make_time_index' must be specified in 'additional_variables' or 'copy_variables'")
        if index == base_entity.index:
            raise ValueError("'index' must be different from the index column of the base entity")

        transfer_types = {}
        transfer_types[index] = type(base_entity[index])
        for v in additional_variables + copy_variables:
            if type(base_entity[v]) == vtypes.DatetimeTimeIndex:
                transfer_types[v] = vtypes.Datetime
            elif type(base_entity[v]) == vtypes.NumericTimeIndex:
                transfer_types[v] = vtypes.Numeric
            else:
                transfer_types[v] = type(base_entity[v])

        # create and add new entity
        new_entity_df = self[base_entity_id].df.copy()

        if make_time_index is None and base_entity.time_index is not None:
            make_time_index = True

        if isinstance(make_time_index, str):
            # Set the new time index to make_time_index.
            base_time_index = make_time_index
            new_entity_time_index = make_time_index
            already_sorted = (new_entity_time_index == base_entity.time_index)
        elif make_time_index:
            # Create a new time index based on the base entity time index.
            base_time_index = base_entity.time_index
            if new_entity_time_index is None:
                new_entity_time_index = "first_%s_time" % (base_entity.id)

            already_sorted = True

            assert base_entity.time_index is not None, \
                "Base entity doesn't have time_index defined"

            if base_time_index not in [v for v in additional_variables]:
                copy_variables.append(base_time_index)

            transfer_types[new_entity_time_index] = type(base_entity[base_entity.time_index])
        else:
            new_entity_time_index = None
            already_sorted = False

        if new_entity_time_index is not None and new_entity_time_index == index:
            raise ValueError("time_index and index cannot be the same value, %s" % (new_entity_time_index))

        selected_variables = [index] +\
            [v for v in additional_variables] +\
            [v for v in copy_variables]

        new_entity_df2 = new_entity_df. \
            drop_duplicates(index, keep='first')[selected_variables]

        if make_time_index:
            new_entity_df2 = new_entity_df2.rename(columns={base_time_index: new_entity_time_index})
        if make_secondary_time_index:
            assert len(make_secondary_time_index) == 1, "Can only provide 1 secondary time index"
            secondary_time_index = list(make_secondary_time_index.keys())[0]

            secondary_variables = [index, secondary_time_index] + list(make_secondary_time_index.values())[0]
            secondary_df = new_entity_df. \
                drop_duplicates(index, keep='last')[secondary_variables]
            if new_entity_secondary_time_index:
                secondary_df = secondary_df.rename(columns={secondary_time_index: new_entity_secondary_time_index})
                secondary_time_index = new_entity_secondary_time_index
            else:
                new_entity_secondary_time_index = secondary_time_index

            secondary_df = secondary_df.set_index(index)
            new_entity_df = new_entity_df2.join(secondary_df, on=index)
        else:
            new_entity_df = new_entity_df2

        base_entity_index = index

        transfer_types[index] = vtypes.Categorical
        if make_secondary_time_index:
            old_ti_name = list(make_secondary_time_index.keys())[0]
            ti_cols = list(make_secondary_time_index.values())[0]
            ti_cols = [c if c != old_ti_name else secondary_time_index for c in ti_cols]
            make_secondary_time_index = {secondary_time_index: ti_cols}

        self.entity_from_dataframe(
            new_entity_id,
            new_entity_df,
            index,
            already_sorted=already_sorted,
            time_index=new_entity_time_index,
            secondary_time_index=make_secondary_time_index,
            variable_types=transfer_types)

        self.entity_dict[base_entity_id].delete_variables(additional_variables)

        new_entity = self.entity_dict[new_entity_id]
        base_entity.convert_variable_type(base_entity_index, vtypes.Id, convert_data=False)
        self.add_relationship(Relationship(new_entity[index], base_entity[base_entity_index]))
        self.reset_data_description()
        return self
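    # Usage sketch for normalize_entity, assuming an entityset `es` whose
    # "transactions" entity has "session_id" and "device" columns (both names
    # are assumptions for illustration):
    #
    #   es.normalize_entity(base_entity_id="transactions",
    #                       new_entity_id="sessions",
    #                       index="session_id",
    #                       additional_variables=["device"])
    #
    # This moves "device" onto the new "sessions" entity and adds a
    # sessions -> transactions relationship across "session_id".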
    ###########################################################################
    # Data wrangling methods  #################################################
    ###########################################################################

    def concat(self, other, inplace=False):
        '''Combine entityset with another to create a new entityset with the combined data of both entitysets.
        '''
        assert_string = "Entitysets must have the same entities, relationships"\
            ", and variable_ids"
        assert (self.__eq__(other) and
                self.relationships == other.relationships), assert_string

        for entity in self.entities:
            assert entity.id in other.entity_dict, assert_string
            assert (len(self[entity.id].variables) ==
                    len(other[entity.id].variables)), assert_string
            other_variable_ids = [o_variable.id for o_variable in
                                  other[entity.id].variables]
            assert (all([variable.id in other_variable_ids
                         for variable in self[entity.id].variables])), assert_string

        if inplace:
            combined_es = self
        else:
            combined_es = copy.deepcopy(self)

        has_last_time_index = []
        for entity in self.entities:
            self_df = entity.df
            other_df = other[entity.id].df
            combined_df = pd.concat([self_df, other_df])
            if entity.created_index == entity.index:
                columns = [col for col in combined_df.columns if
                           col != entity.index or col != entity.time_index]
            else:
                columns = [entity.index]
            combined_df.drop_duplicates(columns, inplace=True)

            if entity.time_index:
                combined_df.sort_values([entity.time_index, entity.index], inplace=True)
            else:
                combined_df.sort_index(inplace=True)
            if (entity.last_time_index is not None or
                    other[entity.id].last_time_index is not None):
                has_last_time_index.append(entity.id)

            combined_es[entity.id].update_data(df=combined_df,
                                               recalculate_last_time_indexes=False)

        combined_es.add_last_time_indexes(updated_entities=has_last_time_index)
        self.reset_data_description()
        return combined_es

    ###########################################################################
    # Indexing methods  #######################################################
    ###########################################################################

    def add_last_time_indexes(self, updated_entities=None):
        """
        Calculates the last time index values for each entity (the last time
        an instance or children of that instance were observed). Used when
        calculating features using training windows.

        Args:
            updated_entities (list[str]): List of entity ids to update last_time_index for
                (will update all parents of those entities as well)
        """
        # Generate graph of entities to find leaf entities
        children = defaultdict(list)  # parent --> child mapping
        child_vars = defaultdict(dict)
        for r in self.relationships:
            children[r.parent_entity.id].append(r.child_entity)
            child_vars[r.parent_entity.id][r.child_entity.id] = r.child_variable

        updated_entities = updated_entities or []
        if updated_entities:
            # find parents of updated_entities
            parent_queue = updated_entities[:]
            parents = set()
            while len(parent_queue):
                e = parent_queue.pop(0)
                if e in parents:
                    continue
                parents.add(e)

                for parent_id, _ in self.get_forward_entities(e):
                    parent_queue.append(parent_id)

            queue = [self[p] for p in parents]
            to_explore = parents
        else:
            to_explore = set([e.id for e in self.entities[:]])
            queue = self.entities[:]

        explored = set()

        for e in queue:
            e.last_time_index = None

        # We will explore children of entities on the queue,
        # which may not be in the to_explore set. Therefore,
        # we check whether all elements of to_explore are in
        # explored, rather than just comparing length
        while not to_explore.issubset(explored):
            entity = queue.pop(0)

            if entity.last_time_index is None:
                if entity.time_index is not None:
                    lti = entity.df[entity.time_index].copy()
                    if isinstance(entity.df, dd.DataFrame):
                        # The current Dask implementation doesn't set the index of the dataframe
                        # to the entity's index, so we have to do it manually here
                        lti.index = entity.df[entity.index].copy()
                else:
                    lti = entity.df[entity.index].copy()
                    if isinstance(entity.df, dd.DataFrame):
                        lti.index = entity.df[entity.index].copy()
                        lti = lti.apply(lambda x: None)
                    else:
                        lti[:] = None

                entity.last_time_index = lti

            if entity.id in children:
                child_entities = children[entity.id]

                # if all children not explored, skip for now
                if not set([e.id for e in child_entities]).issubset(explored):
                    # Now there is a possibility that a child entity
                    # was not explicitly provided in updated_entities,
                    # and never made it onto the queue. If updated_entities
                    # is None then we just load all entities onto the queue
                    # so we didn't need this logic
                    for e in child_entities:
                        if e.id not in explored and e.id not in [q.id for q in queue]:
                            queue.append(e)
                    queue.append(entity)
                    continue

                # updated last time from all children
                for child_e in child_entities:
                    if child_e.last_time_index is None:
                        continue
                    link_var = child_vars[entity.id][child_e.id].id

                    if isinstance(child_e.last_time_index, dd.Series):
                        to_join = child_e.df[link_var]
                        to_join.index = child_e.df[child_e.index]

                        lti_df = child_e.last_time_index.to_frame(name='last_time').join(
                            to_join.to_frame(name=entity.index)
                        )
                        new_index = lti_df.index.copy()
                        new_index.name = None
                        lti_df.index = new_index
                        lti_df = lti_df.groupby(lti_df[entity.index]).agg('max')

                        lti_df = entity.last_time_index.to_frame(name='last_time_old').join(lti_df)
                    else:
                        lti_df = pd.DataFrame({'last_time': child_e.last_time_index,
                                               entity.index: child_e.df[link_var]})

                        # sort by time and keep only the most recent
                        lti_df.sort_values(['last_time', entity.index],
                                           kind="mergesort", inplace=True)

                        lti_df.drop_duplicates(entity.index,
                                               keep='last',
                                               inplace=True)

                        lti_df.set_index(entity.index, inplace=True)
                        lti_df = lti_df.reindex(entity.last_time_index.index)
                        lti_df['last_time_old'] = entity.last_time_index

                    if not isinstance(lti_df, dd.DataFrame) and lti_df.empty:
                        # Pandas errors out if it tries to do fillna and then max on an empty dataframe
                        lti_df = pd.Series()
                    else:
                        lti_df['last_time'] = lti_df['last_time'].astype('datetime64[ns]')
                        lti_df['last_time_old'] = lti_df['last_time_old'].astype('datetime64[ns]')
                        lti_df = lti_df.fillna(pd.to_datetime('1800-01-01 00:00')).max(axis=1)
                        lti_df = lti_df.replace(pd.to_datetime('1800-01-01 00:00'), pd.NaT)
                        # lti_df = lti_df.apply(lambda x: x.dropna().max(), axis=1)

                    entity.last_time_index = lti_df
                    entity.last_time_index.name = 'last_time'

            explored.add(entity.id)
        self.reset_data_description()
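    # Sketch of combining two entitysets that share the same schema (same
    # entities, relationships, and variable ids); `es_a` and `es_b` are
    # hypothetical entitysets built from different partitions of the data:
    #
    #   combined = es_a.concat(es_b)
    #   combined.add_last_time_indexes()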
    ###########################################################################
    # Other  ##################################################################
    ###########################################################################

    def add_interesting_values(self, max_values=5, verbose=False):
        """Find interesting values for categorical variables, to be used to generate "where" clauses

        Args:
            max_values (int) : Maximum number of values per variable to add.
            verbose (bool) : If True, print summary of interesting values found.

        Returns:
            None
        """
        for entity in self.entities:
            entity.add_interesting_values(max_values=max_values, verbose=verbose)
        self.reset_data_description()
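    # Sketch: precompute up to 3 "interesting" categorical values per variable
    # on an existing entityset `es` before generating features with
    # where-clauses:
    #
    #   es.add_interesting_values(max_values=3, verbose=True)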
    def plot(self, to_file=None):
        """
        Create a UML diagram-ish graph of the EntitySet.

        Args:
            to_file (str, optional) : Path to where the plot should be saved.
                If set to None (as by default), the plot will not be saved.

        Returns:
            graphviz.Digraph : Graph object that can directly be displayed in
                Jupyter notebooks.
        """
        GRAPHVIZ_ERR_MSG = ('Please install graphviz to plot entity sets.' +
                            ' (See https://docs.featuretools.com/en/stable/getting_started/install.html#installing-graphviz for' +
                            ' details)')
        graphviz = import_or_raise("graphviz", GRAPHVIZ_ERR_MSG)

        # Try rendering a dummy graph to see if a working backend is installed
        try:
            graphviz.Digraph().pipe()
        except graphviz.backend.ExecutableNotFound:
            raise RuntimeError(
                "To plot entity sets, a graphviz backend is required.\n" +
                "Install the backend using one of the following commands:\n" +
                "  Mac OS: brew install graphviz\n" +
                "  Linux (Ubuntu): sudo apt-get install graphviz\n" +
                "  Windows: conda install python-graphviz\n" +
                "  For more details visit: https://docs.featuretools.com/en/stable/getting_started/install.html"
            )

        if to_file:
            # Explicitly cast to str in case a Path object was passed in
            to_file = str(to_file)

            split_path = to_file.split('.')
            if len(split_path) < 2:
                raise ValueError("Please use a file extension like '.pdf'" +
                                 " so that the format can be inferred")

            format = split_path[-1]
            valid_formats = graphviz.backend.FORMATS
            if format not in valid_formats:
                raise ValueError("Unknown format. Make sure your format is" +
                                 " amongst the following: %s" % valid_formats)
        else:
            format = None

        # Initialize a new directed graph
        graph = graphviz.Digraph(self.id, format=format,
                                 graph_attr={'splines': 'ortho'})

        # Draw entities
        for entity in self.entities:
            variables_string = '\l'.join([var.id + ' : ' + var.type_string  # noqa: W605
                                          for var in entity.variables])
            nrows = entity.shape[0]
            label = '{%s (%d row%s)|%s\l}' % (entity.id, nrows, 's' * (nrows > 1), variables_string)  # noqa: W605
            graph.node(entity.id, shape='record', label=label)

        # Draw relationships
        for rel in self.relationships:
            # Display the key only once if it is the same for both related entities
            if rel._parent_variable_id == rel._child_variable_id:
                label = rel._parent_variable_id
            else:
                label = '%s -> %s' % (rel._parent_variable_id, rel._child_variable_id)

            graph.edge(rel._child_entity_id, rel._parent_entity_id, xlabel=label)

        if to_file:
            # Graphviz always appends the format to the file name, so we need to
            # remove it manually to avoid file names like 'file_name.pdf.pdf'
            offset = len(format) + 1  # Add 1 for the dot
            output_path = to_file[:-offset]

            graph.render(output_path, cleanup=True)

        return graph
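    # Plotting sketch (requires the graphviz Python package and a graphviz
    # system backend); the output file name is an arbitrary example:
    #
    #   graph = es.plot(to_file="entityset.pdf")
    #   graph  # displays inline in a Jupyter notebook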