Source code for featuretools.demo.flight

import math
import re

import pandas as pd
from tqdm import tqdm

import featuretools as ft
import featuretools.variable_types as vtypes


[docs]def load_flight(month_filter=None,
                categorical_filter=None,
                nrows=None,
                demo=True,
                return_single_table=False,
                verbose=False):
    '''
    Download, clean, and filter flight data from 2017.
    The original dataset can be found `here <https://www.transtats.bts.gov/DL_SelectFields.asp?DB_Short_Name=On-Time&Table_ID=236>`_.

    Args:

        month_filter (list[int]): Only use data from these months (example is ``[1, 2]``).
            To skip, set to None.
        categorical_filter (dict[str->str]): Use only specified categorical values.
            Example is ``{'dest_city': ['Boston, MA'], 'origin_city': ['Boston, MA']}``
            which returns all flights in OR out of Boston. To skip, set to None.
        nrows (int): Passed to nrows in ``pd.read_csv``. Used before filtering.
        demo (bool): Use only two months of data. If False, use the whole year.
        return_single_table (bool): Exit the function early and return a dataframe.
        verbose (bool): Show a progress bar while loading the data.

    Examples:

        .. ipython::
            :verbatim:

            In [1]: import featuretools as ft

            In [2]: es = ft.demo.load_flight(verbose=True,
               ...:                          month_filter=[1],
               ...:                          categorical_filter={'origin_city':['Boston, MA']})
            100%|xxxxxxxxxxxxxxxxxxxxxxxxx| 100/100 [01:16<00:00,  1.31it/s]

            In [3]: es
            Out[3]:
            Entityset: Flight Data
              Entities:
                airports [Rows: 55, Columns: 3]
                flights [Rows: 613, Columns: 9]
                trip_logs [Rows: 9456, Columns: 22]
                airlines [Rows: 10, Columns: 1]
              Relationships:
                trip_logs.flight_id -> flights.flight_id
                flights.carrier -> airlines.carrier
                flights.dest -> airports.dest
    '''

    filename, csv_length = get_flight_filename(demo=demo)

    print('Downloading data ...')
    url = "https://api.featurelabs.com/datasets/{}?version={}".format(filename, ft.__version__)

    chunksize = math.ceil(csv_length / 99)
    pd.options.display.max_columns = 200
    iter_csv = pd.read_csv(url,
                           compression='zip',
                           iterator=True,
                           nrows=nrows,
                           chunksize=chunksize)
    if verbose:
        iter_csv = tqdm(iter_csv, total=100)

    partial_df_list = []
    for chunk in iter_csv:
        df = filter_data(_clean_data(chunk),
                         month_filter=month_filter,
                         categorical_filter=categorical_filter)
        partial_df_list.append(df)
    data = pd.concat(partial_df_list)

    if return_single_table:
        return data

    es = make_es(data)

    return es


def make_es(data):
    es = ft.EntitySet('Flight Data')
    arr_time_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
                        'national_airspace_delay', 'security_delay',
                        'late_aircraft_delay', 'canceled', 'diverted',
                        'taxi_in', 'taxi_out', 'air_time', 'dep_time']

    variable_types = {'flight_num': vtypes.Categorical,
                      'distance_group': vtypes.Ordinal,
                      'canceled': vtypes.Boolean,
                      'diverted': vtypes.Boolean}

    es.entity_from_dataframe('trip_logs',
                             data,
                             index='trip_log_id',
                             make_index=True,
                             time_index='date_scheduled',
                             secondary_time_index={'arr_time': arr_time_columns},
                             variable_types=variable_types)

    es.normalize_entity('trip_logs', 'flights', 'flight_id',
                        additional_variables=['origin', 'origin_city', 'origin_state',
                                              'dest', 'dest_city', 'dest_state',
                                              'distance_group', 'carrier', 'flight_num'])

    es.normalize_entity('flights', 'airlines', 'carrier',
                        make_time_index=False)

    es.normalize_entity('flights', 'airports', 'dest',
                        additional_variables=['dest_city', 'dest_state'],
                        make_time_index=False)
    return es


def _clean_data(data):

    # Make column names snake case
    clean_data = data.rename(
        columns={col: convert(col) for col in data})

    # Chance crs -> "scheduled" and other minor clarifications
    clean_data = clean_data.rename(columns={'crs_arr_time': 'scheduled_arr_time',
                                            'crs_dep_time': 'scheduled_dep_time',
                                            'crs_elapsed_time': 'scheduled_elapsed_time',
                                            'nas_delay': 'national_airspace_delay',
                                            'origin_city_name': 'origin_city',
                                            'dest_city_name': 'dest_city',
                                            'cancelled': 'canceled'})

    # Combine strings like 0130 (1:30 AM) with dates (2017-01-01)
    clean_data['scheduled_dep_time'] = clean_data['scheduled_dep_time'].apply(lambda x: str(x)) + clean_data['flight_date'].astype('str')

    # Parse combined string as a date
    clean_data.loc[:, 'scheduled_dep_time'] = pd.to_datetime(
        clean_data['scheduled_dep_time'], format='%H%M%Y-%m-%d', errors='coerce')

    clean_data['scheduled_elapsed_time'] = pd.to_timedelta(clean_data['scheduled_elapsed_time'], unit='m')

    clean_data = _reconstruct_times(clean_data)

    # Create a time index 6 months before scheduled_dep
    clean_data.loc[:, 'date_scheduled'] = clean_data['scheduled_dep_time'].dt.date - \
        pd.Timedelta('120d')

    # A null entry for a delay means no delay
    clean_data = _fill_labels(clean_data)

    # Nulls for scheduled values are too problematic. Remove them.
    clean_data = clean_data.dropna(
        axis='rows', subset=['scheduled_dep_time', 'scheduled_arr_time'])

    # Make a flight id. Define a flight as a combination of:
    # 1. carrier 2. flight number 3. origin airport 4. dest airport
    clean_data.loc[:, 'flight_id'] = clean_data['carrier'] + '-' + \
        clean_data['flight_num'].apply(lambda x: str(x)) + ':' + clean_data['origin'] + '->' + clean_data['dest']

    column_order = [
        'flight_id',
        'flight_num',
        'date_scheduled',
        'scheduled_dep_time',
        'scheduled_arr_time',
        'carrier',
        'origin', 'origin_city', 'origin_state',
        'dest', 'dest_city', 'dest_state',
        'distance_group',
        'dep_time',
        'arr_time',
        'dep_delay', 'taxi_out', 'taxi_in', 'arr_delay',
        'diverted', 'scheduled_elapsed_time', 'air_time', 'distance',
        'carrier_delay', 'weather_delay',
        'national_airspace_delay', 'security_delay', 'late_aircraft_delay',
        'canceled'
    ]

    clean_data = clean_data[column_order]

    return clean_data


def _fill_labels(clean_data):
    labely_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
                      'national_airspace_delay', 'security_delay',
                      'late_aircraft_delay', 'canceled', 'diverted',
                      'taxi_in', 'taxi_out', 'air_time']
    for col in labely_columns:
        clean_data.loc[:, col] = clean_data[col].fillna(0)

    return clean_data


def _reconstruct_times(clean_data):
    """ Reconstruct departure_time, scheduled_dep_time,
        arrival_time and scheduled_arr_time by adding known delays
        to known times. We do:
            - dep_time is scheduled_dep + dep_delay
            - arr_time is dep_time + taxiing and air_time
            - scheduled arrival is scheduled_dep + scheduled_elapsed
    """
    clean_data.loc[:, 'dep_time'] = clean_data['scheduled_dep_time'] + \
        pd.to_timedelta(clean_data['dep_delay'], unit='m')

    clean_data.loc[:, 'arr_time'] = clean_data['dep_time'] + \
        pd.to_timedelta(clean_data['taxi_out'] +
                        clean_data['air_time'] +
                        clean_data['taxi_in'], unit='m')

    clean_data.loc[:, 'scheduled_arr_time'] = clean_data['scheduled_dep_time'] + \
        clean_data['scheduled_elapsed_time']
    return clean_data


def filter_data(clean_data,
                month_filter=None,
                categorical_filter=None):
    if month_filter is not None:
        tmp = clean_data['scheduled_dep_time'].dt.month.isin(month_filter)
        clean_data = clean_data[tmp]

    if categorical_filter is not None:
        tmp = False
        for key, values in categorical_filter.items():
            tmp = tmp | clean_data[key].isin(values)
        clean_data = clean_data[tmp]

    return clean_data


def convert(name):
    # Rename columns to underscore
    # Code via SO https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()


def get_flight_filename(demo=True):
    if demo:
        filename = SMALL_FLIGHT_CSV
        rows = 860457
    else:
        filename = BIG_FLIGHT_CSV
        rows = 5162742

    return filename, rows


SMALL_FLIGHT_CSV = 'data_2017_jan_feb.csv.zip'
BIG_FLIGHT_CSV = 'data_all_2017.csv.zip'