from .imports import *
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.ensemble import forest
from sklearn.tree import export_graphviz


def set_plot_sizes(sml, med, big):
    plt.rc('font', size=sml)          # controls default text sizes
    plt.rc('axes', titlesize=sml)     # fontsize of the axes title
    plt.rc('axes', labelsize=med)     # fontsize of the x and y labels
    plt.rc('xtick', labelsize=sml)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=sml)    # fontsize of the tick labels
    plt.rc('legend', fontsize=sml)    # legend fontsize
    plt.rc('figure', titlesize=big)   # fontsize of the figure title

def parallel_trees(m, fn, n_jobs=8):
    return list(ProcessPoolExecutor(n_jobs).map(fn, m.estimators_))

def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """ Draws a representation of a decision tree in IPython.

    Parameters:
    -----------
    t: The tree you wish to draw.
    df: The data used to train the tree. This is used to get the names of the features.
    """
    s = export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                        special_characters=True, rotate=True, precision=precision)
    IPython.display.display(graphviz.Source(re.sub('Tree {',
        f'Tree {{ size={size}; ratio={ratio}', s)))

def combine_date(years, months=1, days=1, weeks=None, hours=None, minutes=None,
                 seconds=None, milliseconds=None, microseconds=None, nanoseconds=None):
    years = np.asarray(years) - 1970
    months = np.asarray(months) - 1
    days = np.asarray(days) - 1
    types = ('<M8[Y]', '<m8[M]', '<m8[D]', '<m8[W]', '<m8[h]',
             '<m8[m]', '<m8[s]', '<m8[ms]', '<m8[us]', '<m8[ns]')
    vals = (years, months, days, weeks, hours, minutes, seconds,
            milliseconds, microseconds, nanoseconds)
    return sum(np.asarray(v, dtype=t) for t, v in zip(types, vals) if v is not None)

def get_sample(df, n):
    """ Gets a random sample of n rows from df, without replacement.

    Parameters:
    -----------
    df: A pandas data frame that you wish to sample from.
    n: The number of rows you wish to sample.

    Returns:
    --------
    return value: A random sample of n rows of df.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    >>> get_sample(df, 2)
       col1 col2
    1     2    b
    2     3    a
    """
    idxs = sorted(np.random.permutation(len(df))[:n])
    return df.iloc[idxs].copy()
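
# A minimal usage sketch for combine_date (illustrative, not part of the
# module API): it sums a datetime64 year base with timedelta64 components,
# so per-component arrays compose elementwise. Assuming np is available
# via .imports:
#
#   combine_date(np.array([2000, 2001]), months=np.array([3, 4]), days=np.array([11, 12]))
#   # -> array(['2000-03-11', '2001-04-12'], dtype='datetime64[D]')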
def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns
    containing the information from the date. This applies the changes inplace.

    Parameters:
    -----------
    df: A pandas data frame. df gains several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true, time features Hour, Minute, Second will be added.

    Examples:
    ---------

    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'], infer_datetime_format=False) })
    >>> df

        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13

    >>> add_datepart(df, 'A')
    >>> df

       AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0  2000      3    10   11          5         71         False           False           False             False        False          False  952732800
    1  2000      3    10   12          6         72         False           False           False             False        False          False  952819200
    2  2000      3    11   13          0         73         False           False           False             False        False          False  952905600
    """
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

def is_date(x): return np.issubdtype(x.dtype, np.datetime64)

def train_cats(df):
    """Change any columns of strings in a pandas dataframe to columns of
    categorical values. This applies the changes inplace.

    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values.

    Examples:
    ---------

    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category
    """
    for n,c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

def apply_cats(df, trn):
    """Changes any columns of strings in df into categorical variables using trn as
    a template for the category codes.

    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values. The category codes are determined by trn.
    trn: A pandas dataframe. When creating a category for df, it looks up
        what the category codes were in trn and makes those the category
        codes for df.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category {a : 1, b : 2}

    >>> df2 = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['b', 'a', 'a']})
    >>> apply_cats(df2, df)
    >>> df2

       col1 col2
    0     1    b
    1     2    a
    2     3    a

    now the type of col2 is category {a : 1, b : 2}
    """
    for n,c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name=='category'):
            df[n] = pd.Categorical(c, categories=trn[n].cat.categories, ordered=True)
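
# Usage sketch for the train_cats/apply_cats pair (df_train and df_valid are
# hypothetical names, not defined in this module). The point is that the
# validation frame must reuse the training frame's category codes; otherwise
# the same string could map to different integers in the two frames:
#
#   train_cats(df_train)             # infer ordered categories from training data
#   apply_cats(df_valid, df_train)   # reuse those categories on validation data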
def fix_missing(df, col, name, na_dict):
    """ Fill missing data in a column of df with the median, and add a {name}_na
    column which specifies if the data was missing.

    Parameters:
    -----------
    df: The data frame that will be changed.
    col: The column of data to fix by filling in missing data.
    name: The name of the new filled column in df.
    na_dict: A dictionary of values to create na's of and the value to insert.
        If name is not a key of na_dict the median will fill any missing data.
        Also if name is not a key of na_dict and there is no missing data in col,
        then no {name}_na column is created.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col1'], 'col1', {})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1     2    2    True
    2     3    2   False

    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col2'], 'col2', {})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1   500    2    True
    2     3    2   False
    """
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict

def numericalize(df, col, name, max_n_cat):
    """ Changes the column col from a categorical type to its integer codes.

    Parameters:
    -----------
    df: A pandas dataframe. df[name] will be filled with the integer codes from col.
    col: The column you wish to change into the categories.
    name: The column name you wish to insert into df. This column will hold the
        integer codes.
    max_n_cat: If col has more categories than max_n_cat it will be converted to
        its integer codes; otherwise it is left as a category (so proc_df can
        one-hot encode it instead). If max_n_cat is None, then col will always
        be converted.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> numericalize(df, df['col2'], 'col3', None)
    >>> df

       col1 col2 col3
    0     1    a    1
    1     2    b    2
    2     3    a    1
    """
    if not is_numeric_dtype(col) and (max_n_cat is None or col.nunique() > max_n_cat):
        df[name] = col.cat.codes + 1

def scale_vars(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
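
# Sketch of the intended train/test pattern for fix_missing (df_train and
# df_test are illustrative names). The returned na_dict records each filled
# column's median, so the same fillers can be reapplied to unseen data:
#
#   nas = {}
#   for n, c in df_train.items(): nas = fix_missing(df_train, c, n, nas)
#   for n, c in df_test.items(): fix_missing(df_test, c, n, nas)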
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    Parameters:
    -----------
    df: The data frame you wish to process.
    y_fld: The name of the response variable.
    skip_flds: A list of fields that are dropped from df.
    ignore_flds: A list of fields that are ignored during processing.
    do_scale: Standardizes each column in df. Takes boolean values (True, False).
    na_dict: A dictionary of na columns to add. Na columns are also added if there
        are any missing values.
    preproc_fn: A function that gets applied to df.
    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.
    subset: Takes a random subset of size subset from df.
    mapper: If do_scale is set as True, the mapper variable provides the values used
        for scaling of variables during training time (mean and standard deviation).

    Returns:
    --------
    [x, y, nas, mapper(optional)]:
        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.
        y: y is the response variable.
        nas: returns a dictionary of which nas it created, and the associated median.
        mapper: A DataFrameMapper which stores the mean and standard deviation of the
            corresponding continuous variables, which is then used for scaling during
            test time.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> x, y, nas = proc_df(df, 'col1')
    >>> x

       col2
    0     1
    1     2
    2     1

    >>> data = pd.DataFrame({'pet':      ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
    ...                      'children': [4., 6, 3, 3, 2, 3, 5, 4],
    ...                      'salary':   [90, 24, 44, 27, 32, 59, 36, 27]})
    >>> mapper = DataFrameMapper([('pet', LabelBinarizer()), (['children'], StandardScaler())])
    >>> np.round(mapper.fit_transform(data.copy()), 2)
    array([[ 1.  ,  0.  ,  0.  ,  0.21],
           [ 0.  ,  1.  ,  0.  ,  1.88],
           [ 0.  ,  1.  ,  0.  , -0.63],
           [ 0.  ,  0.  ,  1.  , -0.63],
           [ 1.  ,  0.  ,  0.  , -1.46],
           [ 0.  ,  1.  ,  0.  , -0.63],
           [ 1.  ,  0.  ,  0.  ,  1.04],
           [ 0.  ,  0.  ,  1.  ,  0.21]])
    """
    if not ignore_flds: ignore_flds = []
    if not skip_flds: skip_flds = []
    if subset: df = get_sample(df, subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)

def set_rf_samples(n):
    """ Changes Scikit-learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))

def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))

def get_nn_mappers(df, cat_vars, contin_vars):
    # Replace nulls: continuous columns get a sentinel far above the observed
    # range (max + 100); categorical columns get the string '#NA#'.
    for v in contin_vars: df[v] = df[v].fillna(df[v].max()+100)
    for v in cat_vars: df[v].fillna('#NA#', inplace=True)

    # List of tuples, containing variable and instance of a transformer for that variable.
    # For categoricals, use LabelEncoder to map to integers. For continuous, standardize.
    cat_maps = [(o, LabelEncoder()) for o in cat_vars]
    contin_maps = [([o], StandardScaler()) for o in contin_vars]
    return DataFrameMapper(cat_maps).fit(df), DataFrameMapper(contin_maps).fit(df)
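
# End-to-end sketch tying the pieces together (assumes scikit-learn's
# RandomForestRegressor; raw_df, 'saledate', and 'price' are hypothetical
# names, not defined here):
#
#   add_datepart(raw_df, 'saledate')        # expand the date column
#   train_cats(raw_df)                      # strings -> ordered categories
#   X, y, nas = proc_df(raw_df, 'price')    # fully numeric frame + target
#   m = RandomForestRegressor(n_jobs=-1)
#   m.fit(X, y)
#   rf_feat_importance(m, X)                # importances, highest first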