# structured.py

from .imports import *
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.ensemble import forest
from sklearn.tree import export_graphviz

def set_plot_sizes(sml, med, big):
    plt.rc('font', size=sml)         # controls default text sizes
    plt.rc('axes', titlesize=sml)    # fontsize of the axes title
    plt.rc('axes', labelsize=med)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=sml)   # fontsize of the tick labels
    plt.rc('ytick', labelsize=sml)   # fontsize of the tick labels
    plt.rc('legend', fontsize=sml)   # legend fontsize
    plt.rc('figure', titlesize=big)  # fontsize of the figure title

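# Usage sketch (hypothetical point sizes): most text at 12pt, axis labels at
# 14pt, figure titles at 16pt.
#
#   set_plot_sizes(sml=12, med=14, big=16)
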
def parallel_trees(m, fn, n_jobs=8):
    return list(ProcessPoolExecutor(n_jobs).map(fn, m.estimators_))

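# Usage sketch, assuming a fitted RandomForestRegressor `m` and a feature
# frame `X_valid` (hypothetical names). The callable must be picklable for
# ProcessPoolExecutor, so define it at module level rather than as a lambda:
#
#   def get_preds(t): return t.predict(X_valid)
#   preds = np.stack(parallel_trees(m, get_preds))
#   preds.std(0)  # per-row spread of predictions across the trees
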
def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """ Draws a representation of a single decision tree (e.g. one estimator
    from a random forest) in IPython.

    Parameters:
    -----------
    t: The tree you wish to draw.
    df: The data used to train the tree. This is used to get the names of the features.
    """
    s = export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                        special_characters=True, rotate=True, precision=precision)
    IPython.display.display(graphviz.Source(re.sub('Tree {',
        f'Tree {{ size={size}; ratio={ratio}', s)))

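# Usage sketch (hypothetical names): draw the first tree of a fitted forest,
# limiting depth at fit time so the rendering stays readable.
#
#   m = RandomForestRegressor(n_estimators=10, max_depth=3).fit(X_trn, y_trn)
#   draw_tree(m.estimators_[0], X_trn, precision=3)
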
def combine_date(years, months=1, days=1, weeks=None, hours=None, minutes=None,
                 seconds=None, milliseconds=None, microseconds=None, nanoseconds=None):
    years = np.asarray(years) - 1970
    months = np.asarray(months) - 1
    days = np.asarray(days) - 1
    types = ('<M8[Y]', '<m8[M]', '<m8[D]', '<m8[W]', '<m8[h]',
             '<m8[m]', '<m8[s]', '<m8[ms]', '<m8[us]', '<m8[ns]')
    vals = (years, months, days, weeks, hours, minutes, seconds,
            milliseconds, microseconds, nanoseconds)
    return sum(np.asarray(v, dtype=t) for t, v in zip(types, vals)
               if v is not None)

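# Usage sketch: the years become datetime64 offsets from the 1970 epoch and
# the remaining components are added as timedeltas, so separate columns
# combine into one datetime64 array, e.g.
#
#   combine_date([2000, 2001], months=[3, 7], days=[11, 2])
#   # -> array(['2000-03-11', '2001-07-02'], dtype='datetime64[D]')
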
def get_sample(df, n):
    """ Gets a random sample of n rows from df, without replacement.

    Parameters:
    -----------
    df: A pandas data frame that you wish to sample from.
    n: The number of rows you wish to sample.

    Returns:
    --------
    return value: A random sample of n rows of df.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    >>> get_sample(df, 2)
       col1 col2
    1     2    b
    2     3    a
    """
    idxs = sorted(np.random.permutation(len(df))[:n])
    return df.iloc[idxs].copy()

def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns
    containing the information from the date. This applies the changes inplace.

    Parameters:
    -----------
    df: A pandas data frame. df gains several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true, time features (Hour, Minute, Second) will be added.

    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'], infer_datetime_format=False) })
    >>> df
               A
    0 2000-03-11
    1 2000-03-12
    2 2000-03-13

    >>> add_datepart(df, 'A')
    >>> df
       AYear  AMonth  AWeek  ADay  ADayofweek  ADayofyear  AIs_month_end  AIs_month_start  AIs_quarter_end  AIs_quarter_start  AIs_year_end  AIs_year_start   AElapsed
    0   2000       3     10    11           5          71          False            False            False              False         False           False  952732800
    1   2000       3     10    12           6          72          False            False            False              False         False           False  952819200
    2   2000       3     11    13           0          73          False            False            False              False         False           False  952905600
    """
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64
    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

def is_date(x): return np.issubdtype(x.dtype, np.datetime64)

def train_cats(df):
    """Change any columns of strings in a pandas dataframe to a column of
    categorical values. This applies the changes inplace.

    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category
    """
    for n, c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

def apply_cats(df, trn):
    """Changes any columns of strings in df into categorical variables using trn as
    a template for the category codes.

    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values. The category codes are determined by trn.
    trn: A pandas dataframe. When creating a category for df, it looks up
        what the category's codes were in trn and makes those the category
        codes for df.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category {a : 1, b : 2}

    >>> df2 = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['b', 'a', 'a']})
    >>> apply_cats(df2, df)
    >>> df2
       col1 col2
    0     1    b
    1     2    a
    2     3    a

    now the type of col2 is category {a : 1, b : 2}
    """
    for n, c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name == 'category'):
            df[n] = pd.Categorical(c, categories=trn[n].cat.categories, ordered=True)

def fix_missing(df, col, name, na_dict):
    """ Fill missing data in a column of df with the median, and add a {name}_na column
    which specifies if the data was missing.

    Parameters:
    -----------
    df: The data frame that will be changed.
    col: The column of data to fix by filling in missing data.
    name: The name of the new filled column in df.
    na_dict: A dictionary of na columns and the values used to fill them. If
        name is not a key of na_dict, the median will fill any missing data. Also,
        if name is not a key of na_dict and there is no missing data in col, then
        no {name}_na column is created.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1  col2
    0     1     5
    1   nan     2
    2     3     2

    >>> nas = fix_missing(df, df['col1'], 'col1', {})
    >>> df
       col1  col2  col1_na
    0     1     5    False
    1     2     2     True
    2     3     2    False

    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> nas = fix_missing(df, df['col2'], 'col2', {})
    >>> df
       col1  col2
    0     1     5
    1   nan     2
    2     3     2

    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> nas = fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    >>> df
       col1  col2  col1_na
    0     1     5    False
    1   500     2     True
    2     3     2    False
    """
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name + '_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict

def numericalize(df, col, name, max_n_cat):
    """ Changes the column col from a categorical type to its integer codes.

    Parameters:
    -----------
    df: A pandas dataframe. df[name] will be filled with the integer codes from
        col.
    col: The column you wish to convert.
    name: The column name you wish to insert into df. This column will hold the
        integer codes.
    max_n_cat: If col has more categories than max_n_cat, it will be converted
        to its integer codes; a column with at most max_n_cat categories is left
        as a category (so that proc_df can break it into dummy values instead).
        If max_n_cat is None, then col will always be converted.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category {a : 1, b : 2}

    >>> numericalize(df, df['col2'], 'col3', None)
    >>> df
       col1 col2  col3
    0     1    a     1
    1     2    b     2
    2     3    a     1
    """
    if not is_numeric_dtype(col) and (max_n_cat is None or col.nunique() > max_n_cat):
        df[name] = col.cat.codes + 1

def scale_vars(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper

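# Usage sketch (hypothetical frame names): fit the scaler on the training set,
# then reuse the returned mapper so validation data is scaled with the
# training-set mean and standard deviation.
#
#   mapper = scale_vars(df_trn, None)
#   scale_vars(df_valid, mapper)
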
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """ proc_df takes a data frame df, splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    Parameters:
    -----------
    df: The data frame you wish to process.
    y_fld: The name of the response variable.
    skip_flds: A list of fields to drop from df.
    ignore_flds: A list of fields that are ignored during processing.
    do_scale: If True, standardizes each numeric column in df.
    na_dict: A dictionary of na columns to add. Na columns are also added if there
        are any missing values.
    preproc_fn: A function that gets applied to df.
    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.
    subset: Takes a random subset of size subset from df.
    mapper: If do_scale is True, the mapper stores the values used for scaling
        the variables (mean and standard deviation) computed at training time.

    Returns:
    --------
    [x, y, nas, mapper(optional)]:
        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.
        y: y is the response variable.
        nas: returns a dictionary of which nas it created, and the associated median.
        mapper: A DataFrameMapper which stores the mean and standard deviation of the
            corresponding continuous variables, which is then used for scaling at test time.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category {a : 1, b : 2}

    >>> x, y, nas = proc_df(df, 'col1')
    >>> x
       col2
    0     1
    1     2
    2     1

    >>> data = pd.DataFrame({'pet':      ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
    ...                      'children': [4., 6, 3, 3, 2, 3, 5, 4],
    ...                      'salary':   [90, 24, 44, 27, 32, 59, 36, 27]})
    >>> mapper = DataFrameMapper([('pet', LabelBinarizer()),
    ...                           (['children'], StandardScaler())])
    >>> np.round(mapper.fit_transform(data.copy()), 2)
    array([[ 1.  ,  0.  ,  0.  ,  0.21],
           [ 0.  ,  1.  ,  0.  ,  1.88],
           [ 0.  ,  1.  ,  0.  , -0.63],
           [ 0.  ,  0.  ,  1.  , -0.63],
           [ 1.  ,  0.  ,  0.  , -1.46],
           [ 0.  ,  1.  ,  0.  , -0.63],
           [ 1.  ,  0.  ,  0.  ,  1.04],
           [ 0.  ,  0.  ,  1.  ,  0.21]])
    """
    if not ignore_flds: ignore_flds = []
    if not skip_flds: skip_flds = []
    if subset: df = get_sample(df, subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)
    if na_dict is None: na_dict = {}
    for n, c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper)
    for n, c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

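# Usage sketch of the train/validation pattern this function is built for
# (hypothetical frame and column names): process the training set first, then
# pass the returned na_dict and mapper back in so the validation set is filled
# and scaled with training-set statistics.
#
#   x_trn, y_trn, nas, mapper = proc_df(df_trn, 'price', do_scale=True)
#   x_val, y_val, _, _ = proc_df(df_val, 'price', do_scale=True,
#                                na_dict=nas, mapper=mapper)
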
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_}
                        ).sort_values('imp', ascending=False)

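# Usage sketch (hypothetical names): rank the features of a fitted forest by
# importance and inspect the strongest ones.
#
#   fi = rf_feat_importance(m, x_trn)
#   fi.head(10)
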
def set_rf_samples(n):
    """ Changes scikit-learn's random forests to give each tree a random sample of
    n random rows.
    """
    # Monkey-patches a private scikit-learn function, so this is tied to the
    # sklearn version this module was written against.
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))

def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))

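# Usage sketch (hypothetical names): cap each tree's bootstrap sample at
# 20,000 rows while experimenting on a large training set, then restore the
# default behaviour.
#
#   set_rf_samples(20000)
#   m = RandomForestRegressor(n_estimators=40, n_jobs=-1).fit(x_trn, y_trn)
#   reset_rf_samples()
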
def get_nn_mappers(df, cat_vars, contin_vars):
    # Fill nulls: continuous columns get a value above their max (so the fill
    # stands out), categorical columns get the '#NA#' placeholder.
    for v in contin_vars: df[v] = df[v].fillna(df[v].max() + 100)
    for v in cat_vars: df[v].fillna('#NA#', inplace=True)

    # List of tuples, containing variable and instance of a transformer for that variable.
    # For categoricals, use LabelEncoder to map to integers. For continuous, standardize.
    cat_maps = [(o, LabelEncoder()) for o in cat_vars]
    contin_maps = [([o], StandardScaler()) for o in contin_vars]
    return DataFrameMapper(cat_maps).fit(df), DataFrameMapper(contin_maps).fit(df)
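
# Usage sketch (hypothetical column names): build both mappers once on the
# training frame, then transform any frame with the same columns.
#
#   cat_map, contin_map = get_nn_mappers(df_trn, ['store'], ['sales'])
#   cats, contins = cat_map.transform(df_trn), contin_map.transform(df_trn)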