# Source code for featuretools.utils.time_utils

from datetime import datetime, timedelta

import numpy as np
import pandas as pd


def make_temporal_cutoffs(
    instance_ids,
    cutoffs,
    window_size=None,
    num_windows=None,
    start=None,
):
    """Makes a set of equally spaced cutoff times prior to a set of input cutoffs and instance ids.

    If window_size and num_windows are provided, then num_windows of size window_size will be created
    prior to each cutoff time

    If window_size and a start list is provided, then a variable number of windows will be created prior
    to each cutoff time, with the corresponding start time as the first cutoff.

    If num_windows and a start list is provided, then num_windows of variable size will be created prior
    to each cutoff time, with the corresponding start time as the first cutoff

    Args:
        instance_ids (list, np.ndarray, or pd.Series): list of instance ids. This function will make a
            new datetime series of multiple cutoff times for each value in this array.
        cutoffs (list, np.ndarray, or pd.Series): list of datetime objects associated with each
            instance id. Each one of these will be the last time in the new datetime series for each
            instance id
        window_size (pd.Timedelta, optional): amount of time between each datetime in each new cutoff
            series
        num_windows (int, optional): number of windows in each new cutoff series
        start (list, optional): list of start times for each instance id. Entries are matched to
            instance ids by position.

    Returns:
        pd.DataFrame: one row per generated cutoff, with a ``time`` column and an ``instance_id``
        column and a fresh 0..n-1 index. Empty (but with both columns) when no instance ids are given.

    Raises:
        ValueError: if window_size, num_windows and start are all supplied.
    """
    if window_size is not None and num_windows is not None and start is not None:
        raise ValueError(
            "Only supply 2 of the 3 optional args, window_size, num_windows and start",
        )

    # Materialize once so the per-instance lookup below is positional even when
    # `start` is a pd.Series whose index labels are not 0..n-1.
    starts = None if start is None else list(start)

    frames = []
    for position, (instance_id, cutoff) in enumerate(zip(instance_ids, cutoffs)):
        freq = window_size
        range_start = None
        if starts is not None:
            if window_size is None:
                # num_windows points leave (num_windows - 1) gaps between start and cutoff.
                freq = (cutoff - starts[position]) / (num_windows - 1)
            else:
                range_start = starts[position]

        times = pd.date_range(
            end=cutoff,
            periods=num_windows,
            freq=freq,
            start=range_start,
        )
        frames.append(
            pd.DataFrame(
                {"time": times, "instance_id": [instance_id] * len(times)},
            ),
        )

    if not frames:
        # pd.concat refuses an empty list; hand back an empty frame of the same layout.
        return pd.DataFrame(
            {
                "time": pd.Series(dtype="datetime64[ns]"),
                "instance_id": pd.Series(dtype="object"),
            },
        )
    return pd.concat(frames).reset_index(drop=True)
def convert_time_units(secs, unit):
    """Converts a time specified in seconds to a time in the given units

    Args:
        secs (integer): number of seconds. This function will convert the units of this number.
        unit(str): units to be converted to.
            acceptable values: years, months, days, hours, minutes, seconds, milliseconds, nanoseconds

    Returns:
        The value of ``secs`` divided by the number of seconds in one ``unit``.

    Raises:
        ValueError: if ``unit`` is not one of the accepted (plural) unit names.
    """
    # Seconds per unit; the year and month entries are rounded approximations.
    unit_divs = {
        "years": 31540000,
        "months": 2628000,
        "days": 86400,
        "hours": 3600,
        "minutes": 60,
        "seconds": 1,
        "milliseconds": 0.001,
        "nanoseconds": 0.000000001,
    }
    if unit not in unit_divs:
        raise ValueError("Invalid unit given, make sure it is plural")

    return secs / (unit_divs[unit])


def convert_datetime_to_floats(x):
    """Converts a datetime Series to floats in the coarsest unit that fits its first value.

    The first timestamp (truncated to whole seconds since the epoch) is passed to
    ``find_dividend_by_unit`` to choose days, hours, minutes or seconds; every value
    is then expressed as a float count of that unit since the epoch.

    Args:
        x (pd.Series): non-empty Series of datetime64 values.

    Returns:
        np.ndarray: float64 array with one entry per element of ``x``.
    """
    first = int(x.iloc[0].value * 1e-9)
    # Cast to nanosecond resolution before taking the integer view so that the
    # 1e-9 scale factor below is valid whatever unit the column is stored in.
    nanos = x.values.astype("datetime64[ns]").astype(np.int64)
    x = nanos.astype(np.float64)
    dividend = find_dividend_by_unit(first)
    x *= 1e-9 / dividend
    return x


def convert_timedelta_to_floats(x):
    """Converts a timedelta Series to floats in the coarsest unit that fits its first value.

    The first duration (truncated to whole seconds) is passed to
    ``find_dividend_by_unit`` to choose days, hours, minutes or seconds; every value
    is then expressed as a float count of that unit.

    Args:
        x (pd.Series): non-empty Series of timedelta values.

    Returns:
        The total seconds of each element divided by the chosen unit length, as float64.
    """
    first = int(x.iloc[0].total_seconds())
    dividend = find_dividend_by_unit(first)
    x = pd.TimedeltaIndex(x).total_seconds().astype(np.float64) / dividend
    return x


def find_dividend_by_unit(time):
    """Finds whether time best corresponds to a value in
    days, hours, minutes, or seconds.

    Args:
        time: a number of seconds.

    Returns:
        int: 86400, 3600 or 60 for the largest of those that divides ``time``
        evenly, otherwise 1.
    """
    # Try the coarsest unit first so whole days win over whole hours, and so on.
    for dividend in [86400, 3600, 60]:
        div = time / dividend
        if round(div) == div:
            return dividend
    return 1


def calculate_trend(series):
    """Calculates the slope of a degree-1 ``np.polyfit`` of the values against the index.

    Rows with a missing index or value are dropped first. Datetime indexes and
    datetime/timedelta values are converted to floats before fitting.

    Args:
        series (pd.Series): values to fit, indexed by the x coordinate.

    Returns:
        ``np.nan`` when two or fewer rows remain after dropping missing data,
        ``0`` when every remaining x value is identical, otherwise the fitted slope.
    """
    # numpy can't handle `Int64` values, so cast to float
    if series.dtype == "Int64":
        series = series.astype("float64")

    df = pd.DataFrame({"x": series.index, "y": series.values}).dropna()
    if df.shape[0] <= 2:
        return np.nan

    if isinstance(df["x"].iloc[0], (datetime, pd.Timestamp)):
        x = convert_datetime_to_floats(df["x"])
    else:
        x = df["x"].values

    if isinstance(df["y"].iloc[0], (datetime, pd.Timestamp)):
        y = convert_datetime_to_floats(df["y"])
    elif isinstance(df["y"].iloc[0], (timedelta, pd.Timedelta)):
        y = convert_timedelta_to_floats(df["y"])
    else:
        y = df["y"].values

    # Center both axes before fitting.
    x = x - x.mean()
    y = y - y.mean()

    # prevent divide by zero error
    if len(np.unique(x)) == 1:
        return 0

    # consider scipy.stats.linregress for large n cases
    coefficients = np.polyfit(x, y, 1)

    return coefficients[0]