123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475 |
- from .imports import *
- from sklearn_pandas import DataFrameMapper
- from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
- from pandas.api.types import is_string_dtype, is_numeric_dtype
- from sklearn.ensemble import forest
- from sklearn.tree import export_graphviz
def set_plot_sizes(sml, med, big):
    """ Sets the default matplotlib font sizes through plt.rc.
    Parameters:
    -----------
    sml: Size used for default text, axes titles, tick labels and the legend.
    med: Size used for the x and y axis labels.
    big: Size used for the figure title.
    """
    rc_settings = (
        ('font', {'size': sml}),          # default text sizes
        ('axes', {'titlesize': sml}),     # axes title
        ('axes', {'labelsize': med}),     # x and y labels
        ('xtick', {'labelsize': sml}),    # x tick labels
        ('ytick', {'labelsize': sml}),    # y tick labels
        ('legend', {'fontsize': sml}),    # legend
        ('figure', {'titlesize': big}),   # figure title
    )
    for group, options in rc_settings:
        plt.rc(group, **options)
def parallel_trees(m, fn, n_jobs=8):
    """ Applies fn to every estimator of m using a pool of worker processes.
    Parameters:
    -----------
    m: An object exposing an `estimators_` iterable (e.g. a fitted forest).
    fn: The function applied to each estimator. It is sent to worker
        processes, so it has to be picklable.
    n_jobs: The number of worker processes.
    Returns:
    --------
    return value: A list with fn's result for each estimator, in order.
    """
    # The context manager shuts the pool down once the results are collected,
    # so worker processes are not left behind after every call.
    with ProcessPoolExecutor(n_jobs) as executor:
        return list(executor.map(fn, m.estimators_))
def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """ Draws a representation of a single tree in IPython.
    Parameters:
    -----------
    t: The tree you wish to draw.
    df: The data used to train the tree. Its columns supply the feature names.
    size: Value written into the graph's `size` attribute.
    ratio: Value written into the graph's `ratio` attribute.
    precision: Forwarded to export_graphviz as `precision`.
    """
    dot_source = export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                                 special_characters=True, rotate=True, precision=precision)
    # Splice the size/ratio attributes in right after the graph's opening brace.
    graph_header = f'Tree {{ size={size}; ratio={ratio}'
    dot_source = re.sub('Tree {', graph_header, dot_source)
    rendered = graphviz.Source(dot_source)
    IPython.display.display(rendered)
def combine_date(years, months=1, days=1, weeks=None, hours=None, minutes=None,
                 seconds=None, milliseconds=None, microseconds=None, nanoseconds=None):
    """ Builds numpy datetime64 values from separate date and time components.
    Parameters:
    -----------
    years: Calendar year(s); scalar or array-like.
    months: Month number(s), 1-based.
    days: Day(s) of the month, 1-based.
    weeks, hours, minutes, seconds, milliseconds, microseconds, nanoseconds:
        Optional offsets added on top of the date. Components left as None
        are skipped.
    Returns:
    --------
    return value: The sum of all supplied components as datetime64 value(s).
    """
    # Years become an absolute datetime64 counted from 1970; months and days
    # are 1-based on input, so they are shifted to zero-based offsets.
    components = (
        ('<M8[Y]', np.asarray(years) - 1970),
        ('<m8[M]', np.asarray(months) - 1),
        ('<m8[D]', np.asarray(days) - 1),
        ('<m8[W]', weeks),
        ('<m8[h]', hours),
        ('<m8[m]', minutes),
        ('<m8[s]', seconds),
        ('<m8[ms]', milliseconds),
        ('<m8[us]', microseconds),
        ('<m8[ns]', nanoseconds),
    )
    total = 0
    for unit, value in components:
        if value is None:
            continue
        total = total + np.asarray(value, dtype=unit)
    return total
def get_sample(df, n):
    """ Draws n random rows from df without replacement, keeping row order.
    Parameters:
    -----------
    df: A pandas data frame, that you wish to sample from.
    n: The number of rows you wish to sample.
    Returns:
    --------
    return value: A copy of n randomly chosen rows of df, in the same
        relative order as they appear in df.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> get_sample(df, 2)   # which rows are returned is random
       col1 col2
    1     2    b
    2     3    a
    """
    shuffled = np.random.permutation(len(df))
    # Sorting the chosen positions preserves the original row order.
    chosen = sorted(shuffled[:n])
    sample = df.iloc[chosen]
    return sample.copy()
def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.
    Parameters:
    -----------
    df: A pandas data frame. df gains several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
        A trailing 'date'/'Date' is stripped from fldname to form the prefix of
        the new column names.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.
    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']) })
    >>> add_datepart(df, 'A')
    >>> df[['AYear', 'AMonth', 'AWeek', 'ADay', 'ADayofweek', 'AElapsed']]
       AYear  AMonth  AWeek  ADay  ADayofweek   AElapsed
    0   2000       3     10    11           5  952732800
    1   2000       3     10    12           6  952819200
    2   2000       3     11    13           0  952905600
    """
    fld = df[fldname]
    # Covers both naive and timezone-aware datetime columns, and does not
    # raise on pandas extension dtypes the way np.issubdtype does.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week no longer exists in pandas 2.0+; isocalendar() is
            # its replacement. Cast to the dtype of the other integer parts so
            # the column does not become a nullable UInt32.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values
    # The int64 view counts ticks of the column's own resolution, which is not
    # always nanoseconds on newer pandas; scale by the actual unit.
    ticks_per_second = {'s': 1, 'ms': 10 ** 3, 'us': 10 ** 6, 'ns': 10 ** 9}
    unit = getattr(fld.dt, 'unit', 'ns')
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // ticks_per_second[unit]
    if drop:
        df.drop(fldname, axis=1, inplace=True)
def is_date(x):
    """ Returns whether x's dtype is a numpy datetime64 dtype. """
    dtype = x.dtype
    return np.issubdtype(dtype, np.datetime64)
def train_cats(df):
    """Converts every string column of a pandas dataframe into an ordered
    categorical column. This applies the changes inplace.
    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a
    col2 now has the category dtype; col1 is untouched.
    """
    string_columns = [name for name, values in df.items() if is_string_dtype(values)]
    for name in string_columns:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.as_ordered()
def apply_cats(df, trn):
    """Converts columns of df into ordered categoricals, reusing the categories
    of the matching columns in trn. This applies the changes inplace.
    Parameters:
    -----------
    df: A pandas dataframe. Every column that also exists in trn as a
        categorical column is replaced by a categorical with trn's categories.
    trn: A pandas dataframe used as the template. The categories of its
        categorical columns determine the category codes used for df.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> df2 = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['b', 'a', 'a']})
    >>> apply_cats(df2, df)
    >>> df2
       col1 col2
    0     1    b
    1     2    a
    2     3    a
    col2 of df2 is now categorical with the same categories as col2 of df.
    """
    for name, values in df.items():
        if name not in trn.columns:
            continue
        template = trn[name]
        if template.dtype.name != 'category':
            continue
        df[name] = pd.Categorical(values, categories=template.cat.categories, ordered=True)
def fix_missing(df, col, name, na_dict):
    """ Fills missing values of a numeric column and adds a {name}_na indicator
    column marking which rows were missing. Non-numeric columns are left alone.
    Parameters:
    -----------
    df: The data frame that will be changed.
    col: The column of data to fix by filling in missing data.
    name: The name of the filled column in df.
    na_dict: A dictionary mapping column names to the value used to fill
        missing data. If name is a key, that value is used and the {name}_na
        column is always created. Otherwise the median of col is used, and
        nothing is changed when col has no missing data. The filler that was
        used is stored in na_dict under name.
    Returns:
    --------
    return value: na_dict, updated in place.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.nan, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col1'], 'col1', {})
    {'col1': 2.0}
    >>> df
       col1  col2  col1_na
    0   1.0     5    False
    1   2.0     2     True
    2   3.0     2    False
    >>> df = pd.DataFrame({'col1' : [1, np.nan, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    {'col1': 500}
    >>> df
        col1  col2  col1_na
    0    1.0     5    False
    1  500.0     2     True
    2    3.0     2    False
    """
    if not is_numeric_dtype(col):
        return na_dict
    missing = pd.isnull(col)
    if missing.sum() or (name in na_dict):
        df[name + '_na'] = missing
        if name in na_dict:
            filler = na_dict[name]
        else:
            filler = col.median()
        df[name] = col.fillna(filler)
        na_dict[name] = filler
    return na_dict
def numericalize(df, col, name, max_n_cat):
    """ Stores the integer codes (plus one) of a categorical column in df[name].
    Parameters:
    -----------
    df: A pandas dataframe. df[name] will be filled with the integer codes from
        col.
    col: The categorical column whose codes you wish to use. Numeric columns
        are ignored.
    name: The column name you wish to insert into df. This column will hold the
        integer codes.
    max_n_cat: col is converted only when it has more than max_n_cat distinct
        values. If max_n_cat is None, then col will always be converted.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> numericalize(df, df['col2'], 'col3', None)
    >>> df
       col1 col2  col3
    0     1    a     1
    1     2    b     2
    2     3    a     1
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is None or col.nunique() > max_n_cat:
        # Codes are shifted by one, so the -1 code used for missing values becomes 0.
        df[name] = col.cat.codes + 1
def scale_vars(df, mapper):
    """ Standardizes the numeric columns of df inplace using a DataFrameMapper.
    Parameters:
    -----------
    df: The data frame whose columns are scaled.
    mapper: A fitted DataFrameMapper to reuse. If None, a new mapper with one
        StandardScaler per numeric column of df is fitted on df.
    Returns:
    --------
    return value: The mapper that was used for the transformation.
    """
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        numeric_columns = [name for name in df.columns if is_numeric_dtype(df[name])]
        map_f = [([name], StandardScaler()) for name in numeric_columns]
        mapper = DataFrameMapper(map_f).fit(df)
    scaled = mapper.transform(df)
    df[mapper.transformed_names_] = scaled
    return mapper
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe. df itself is not
    modified; the work is done on a copy (or on a random sample of it).
    Parameters:
    -----------
    df: The data frame you wish to process.
    y_fld: The name of the response variable.
    skip_flds: A list of fields that are dropped from df. The list passed in
        is not modified.
    ignore_flds: A list of fields that are ignored during processing. They are
        set aside first and put back, unchanged, in front of the result.
    do_scale: Standardizes each numeric column in df. Takes Boolean values (True, False).
    na_dict: A dictionary of na columns to add. Na columns are also added if there
        are any missing values. The dictionary passed in is updated in place.
    preproc_fn: A function that gets applied to df.
    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.
    subset: Takes a random subset of size subset from df.
    mapper: If do_scale is set as True, the mapper holds the values used for scaling
        of variables during training time (mean and standard deviation). Pass
        None to fit a new one.
    Returns:
    --------
    [x, y, nas, mapper(optional)]:
        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.
        y: y is the response variable, or None when y_fld is None.
        nas: returns a dictionary of which nas it created, and the associated median.
        mapper: Only returned when do_scale is True. A DataFrameMapper which stores
            the mean and standard deviation of the corresponding continuous
            variables, to be reused for scaling at test time.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> x, y, nas = proc_df(df, 'col1')
    >>> x
       col2
    0     1
    1     2
    2     1
    """
    if not ignore_flds:
        ignore_flds = []
    if not skip_flds:
        skip_flds = []
    if subset:
        df = get_sample(df, subset)
    else:
        df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn:
        preproc_fn(df)
    if y_fld is None:
        y = None
    else:
        if not is_numeric_dtype(df[y_fld]):
            df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        # Build a new list instead of using `+=`, which would append y_fld to
        # the caller's own list and leak into later calls that reuse it.
        skip_flds = skip_flds + [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None:
        na_dict = {}
    for n, c in df.items():
        na_dict = fix_missing(df, c, n, na_dict)
    if do_scale:
        mapper = scale_vars(df, mapper)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    # Categorical columns that numericalize left alone are one-hot encoded here.
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale:
        res = res + [mapper]
    return res
def rf_feat_importance(m, df):
    """ Lists the feature importances of a model, most important first.
    Parameters:
    -----------
    m: A fitted model exposing `feature_importances_`.
    df: The data frame whose columns name the features, in the same order.
    Returns:
    --------
    return value: A data frame with the columns 'cols' and 'imp', sorted by
        'imp' in descending order.
    """
    importances = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importances.sort_values('imp', ascending=False)
def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows. This replaces forest._generate_sample_indices, so it affects
    every forest until reset_rf_samples is called.
    """
    def _sample_indices(rs, n_samples):
        # Draw n row indices (with replacement) instead of n_samples.
        random_state = forest.check_random_state(rs)
        return random_state.randint(0, n_samples, n)

    forest._generate_sample_indices = _sample_indices
def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples: each tree again draws
    as many row indices as there are rows.
    """
    def _sample_indices(rs, n_samples):
        random_state = forest.check_random_state(rs)
        return random_state.randint(0, n_samples, n_samples)

    forest._generate_sample_indices = _sample_indices
def get_nn_mappers(df, cat_vars, contin_vars):
    """ Fills missing values in df and fits one mapper for the categorical and
    one for the continuous variables. df is modified inplace.
    Parameters:
    -----------
    df: The data frame holding the variables. Missing values in the listed
        columns are filled in.
    cat_vars: Names of the categorical columns. Nulls become '#NA#' and each
        column is mapped to integers with a LabelEncoder.
    contin_vars: Names of the continuous columns. Nulls become the column
        maximum plus 100 and each column is standardized with a StandardScaler.
    Returns:
    --------
    (cat_mapper, contin_mapper): Two DataFrameMappers fitted on df.
    """
    # Continuous nulls get a value outside the observed range (max + 100).
    for v in contin_vars:
        df[v] = df[v].fillna(df[v].max() + 100)
    # Assign the result back: calling fillna(inplace=True) on df[v] may only
    # change a temporary object and leave df untouched.
    for v in cat_vars:
        df[v] = df[v].fillna('#NA#')
    # list of tuples, containing variable and instance of a transformer for that variable
    # for categoricals, use LabelEncoder to map to integers. For continuous, standardize
    cat_maps = [(o, LabelEncoder()) for o in cat_vars]
    contin_maps = [([o], StandardScaler()) for o in contin_vars]
    return DataFrameMapper(cat_maps).fit(df), DataFrameMapper(contin_maps).fit(df)
|