tensorboard.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. import fastai
  2. from fastai import *
  3. from fastai.vision import *
  4. from fastai.callbacks import *
  5. from fastai.vision.gan import *
  6. from fastai.core import *
  7. import statistics
  8. from .images import ModelImageSet
  9. import torchvision.utils as vutils
  10. from tensorboardX import SummaryWriter
  11. class ModelGraphVisualizer():
  12. def __init__(self):
  13. return
  14. def write_model_graph_to_tensorboard(self, md: DataBunch, model: nn.Module, tbwriter: SummaryWriter):
  15. try:
  16. x, y = md.one_batch(DatasetType.Valid, detach=False, denorm=False)
  17. tbwriter.add_graph(model, x)
  18. except Exception as e:
  19. print(("Failed to generate graph for model: {0}. Note that there's an outstanding issue with "
  20. + "scopes being addressed here: https://github.com/pytorch/pytorch/pull/12400").format(e))
  21. class ModelHistogramVisualizer():
  22. def __init__(self):
  23. return
  24. def write_tensorboard_histograms(self, model: nn.Module, iteration: int, tbwriter: SummaryWriter, name: str = 'model'):
  25. try:
  26. for param_name, param in model.named_parameters():
  27. tbwriter.add_histogram(
  28. name + '/weights/' + param_name, param, iteration)
  29. except Exception as e:
  30. print(("Failed to update histogram for model: {0}").format(e))
  31. class ModelStatsVisualizer():
  32. def __init__(self):
  33. return
  34. def write_tensorboard_stats(self, model: nn.Module, iteration: int, tbwriter: SummaryWriter, name: str = 'model_stats'):
  35. try:
  36. gradients = [x.grad for x in model.parameters()
  37. if x.grad is not None]
  38. gradient_nps = [to_np(x.data) for x in gradients]
  39. if len(gradients) == 0:
  40. return
  41. avg_norm = sum(x.data.norm() for x in gradients)/len(gradients)
  42. tbwriter.add_scalar(
  43. name + '/gradients/avg_norm', avg_norm, iteration)
  44. median_norm = statistics.median(x.data.norm() for x in gradients)
  45. tbwriter.add_scalar(
  46. name + '/gradients/median_norm', median_norm, iteration)
  47. max_norm = max(x.data.norm() for x in gradients)
  48. tbwriter.add_scalar(
  49. name + '/gradients/max_norm', max_norm, iteration)
  50. min_norm = min(x.data.norm() for x in gradients)
  51. tbwriter.add_scalar(
  52. name + '/gradients/min_norm', min_norm, iteration)
  53. num_zeros = sum((np.asarray(x) == 0.0).sum() for x in gradient_nps)
  54. tbwriter.add_scalar(
  55. name + '/gradients/num_zeros', num_zeros, iteration)
  56. avg_gradient = sum(x.data.mean() for x in gradients)/len(gradients)
  57. tbwriter.add_scalar(
  58. name + '/gradients/avg_gradient', avg_gradient, iteration)
  59. median_gradient = statistics.median(
  60. x.data.median() for x in gradients)
  61. tbwriter.add_scalar(
  62. name + '/gradients/median_gradient', median_gradient, iteration)
  63. max_gradient = max(x.data.max() for x in gradients)
  64. tbwriter.add_scalar(
  65. name + '/gradients/max_gradient', max_gradient, iteration)
  66. min_gradient = min(x.data.min() for x in gradients)
  67. tbwriter.add_scalar(
  68. name + '/gradients/min_gradient', min_gradient, iteration)
  69. except Exception as e:
  70. print(
  71. ("Failed to update tensorboard stats for model: {0}").format(e))
  72. class ImageGenVisualizer():
  73. def output_image_gen_visuals(self, learn: Learner, trn_batch: Tuple, val_batch: Tuple, iteration: int, tbwriter: SummaryWriter):
  74. self._output_visuals(learn=learn, batch=val_batch, iteration=iteration,
  75. tbwriter=tbwriter, ds_type=DatasetType.Valid)
  76. self._output_visuals(learn=learn, batch=trn_batch, iteration=iteration,
  77. tbwriter=tbwriter, ds_type=DatasetType.Train)
  78. def _output_visuals(self, learn: Learner, batch: Tuple, iteration: int, tbwriter: SummaryWriter, ds_type: DatasetType):
  79. image_sets = ModelImageSet.get_list_from_model(
  80. learn=learn, batch=batch, ds_type=ds_type)
  81. self._write_tensorboard_images(
  82. image_sets=image_sets, iteration=iteration, tbwriter=tbwriter, ds_type=ds_type)
  83. def _write_tensorboard_images(self, image_sets: [ModelImageSet], iteration: int, tbwriter: SummaryWriter, ds_type: DatasetType):
  84. try:
  85. orig_images = []
  86. gen_images = []
  87. real_images = []
  88. for image_set in image_sets:
  89. orig_images.append(image_set.orig.px)
  90. gen_images.append(image_set.gen.px)
  91. real_images.append(image_set.real.px)
  92. prefix = str(ds_type)
  93. tbwriter.add_image(
  94. prefix + ' orig images', vutils.make_grid(orig_images, normalize=True), iteration)
  95. tbwriter.add_image(
  96. prefix + ' gen images', vutils.make_grid(gen_images, normalize=True), iteration)
  97. tbwriter.add_image(
  98. prefix + ' real images', vutils.make_grid(real_images, normalize=True), iteration)
  99. except Exception as e:
  100. print(
  101. ("Failed to update tensorboard images for model: {0}").format(e))
# -------- The classes below are the ones you actually want to use in practice -------- #
  103. class LearnerTensorboardWriter(LearnerCallback):
  104. def __init__(self, learn: Learner, base_dir: Path, name: str, loss_iters: int = 25, weight_iters: int = 1000, stats_iters: int = 1000):
  105. super().__init__(learn=learn)
  106. self.base_dir = base_dir
  107. self.name = name
  108. log_dir = base_dir/name
  109. self.tbwriter = SummaryWriter(log_dir=str(log_dir))
  110. self.loss_iters = loss_iters
  111. self.weight_iters = weight_iters
  112. self.stats_iters = stats_iters
  113. self.weight_vis = ModelHistogramVisualizer()
  114. self.model_vis = ModelStatsVisualizer()
  115. self.data = None
  116. self.metrics_root = '/metrics/'
  117. def _update_batches_if_needed(self):
  118. # one_batch function is extremely slow. this is an optimization
  119. update_batches = self.data is not self.learn.data
  120. if update_batches:
  121. self.data = self.learn.data
  122. self.trn_batch = self.learn.data.one_batch(
  123. DatasetType.Train, detach=True, denorm=False, cpu=False)
  124. self.val_batch = self.learn.data.one_batch(
  125. DatasetType.Valid, detach=True, denorm=False, cpu=False)
  126. def _write_model_stats(self, iteration):
  127. self.model_vis.write_tensorboard_stats(
  128. model=self.learn.model, iteration=iteration, tbwriter=self.tbwriter)
  129. def _write_training_loss(self, iteration, last_loss):
  130. trn_loss = to_np(last_loss)
  131. self.tbwriter.add_scalar(
  132. self.metrics_root + 'train_loss', trn_loss, iteration)
  133. def _write_weight_histograms(self, iteration):
  134. self.weight_vis.write_tensorboard_histograms(
  135. model=self.learn.model, iteration=iteration, tbwriter=self.tbwriter)
  136. def _write_metrics(self, iteration, last_metrics, start_idx: int = 2):
  137. recorder = self.learn.recorder
  138. for i, name in enumerate(recorder.names[start_idx:]):
  139. if len(last_metrics) < i+1:
  140. return
  141. value = last_metrics[i]
  142. self.tbwriter.add_scalar(
  143. self.metrics_root + name, value, iteration)
  144. def on_batch_end(self, last_loss, metrics, iteration, **kwargs):
  145. if iteration == 0:
  146. return
  147. self._update_batches_if_needed()
  148. if iteration % self.loss_iters == 0:
  149. self._write_training_loss(iteration, last_loss)
  150. if iteration % self.weight_iters == 0:
  151. self._write_weight_histograms(iteration)
  152. # Doing stuff here that requires gradient info, because they get zeroed out afterwards in training loop
  153. def on_backward_end(self, iteration, **kwargs):
  154. if iteration == 0:
  155. return
  156. self._update_batches_if_needed()
  157. if iteration % self.stats_iters == 0:
  158. self._write_model_stats(iteration)
  159. def on_epoch_end(self, metrics, last_metrics, iteration, **kwargs):
  160. self._write_metrics(iteration, last_metrics)
  161. class GANTensorboardWriter(LearnerTensorboardWriter):
  162. def __init__(self, learn: Learner, base_dir: Path, name: str, loss_iters: int = 25, weight_iters: int = 1000,
  163. stats_iters: int = 1000, visual_iters: int = 100):
  164. super().__init__(learn=learn, base_dir=base_dir, name=name, loss_iters=loss_iters,
  165. weight_iters=weight_iters, stats_iters=stats_iters)
  166. self.visual_iters = visual_iters
  167. self.img_gen_vis = ImageGenVisualizer()
  168. self.gen_stats_updated = True
  169. self.crit_stats_updated = True
  170. # override
  171. def _write_weight_histograms(self, iteration):
  172. trainer = self.learn.gan_trainer
  173. generator = trainer.generator
  174. critic = trainer.critic
  175. self.weight_vis.write_tensorboard_histograms(
  176. model=generator, iteration=iteration, tbwriter=self.tbwriter, name='generator')
  177. self.weight_vis.write_tensorboard_histograms(
  178. model=critic, iteration=iteration, tbwriter=self.tbwriter, name='critic')
  179. # override
  180. def _write_model_stats(self, iteration):
  181. trainer = self.learn.gan_trainer
  182. generator = trainer.generator
  183. critic = trainer.critic
  184. # Don't want to write stats when model has zeroed out gradients
  185. gen_mode = trainer.gen_mode
  186. if gen_mode:
  187. self.model_vis.write_tensorboard_stats(
  188. model=generator, iteration=iteration, tbwriter=self.tbwriter, name='gen_model_stats')
  189. self.gen_stats_updated = True
  190. else:
  191. self.model_vis.write_tensorboard_stats(
  192. model=critic, iteration=iteration, tbwriter=self.tbwriter, name='crit_model_stats')
  193. self.crit_stats_updated = True
  194. # override
  195. def _write_training_loss(self, iteration, last_loss):
  196. trainer = self.learn.gan_trainer
  197. recorder = trainer.recorder
  198. if len(recorder.losses) > 0:
  199. trn_loss = to_np((recorder.losses[-1:])[0])
  200. self.tbwriter.add_scalar(
  201. self.metrics_root + 'train_loss', trn_loss, iteration)
  202. def _write_images(self, iteration):
  203. trainer = self.learn.gan_trainer
  204. gen_mode = trainer.gen_mode
  205. trainer.switch(gen_mode=True)
  206. self.img_gen_vis.output_image_gen_visuals(learn=self.learn, trn_batch=self.trn_batch, val_batch=self.val_batch,
  207. iteration=iteration, tbwriter=self.tbwriter)
  208. trainer.switch(gen_mode=gen_mode)
  209. # override
  210. def on_batch_end(self, metrics, iteration, **kwargs):
  211. super().on_batch_end(metrics=metrics, iteration=iteration, **kwargs)
  212. if iteration == 0:
  213. return
  214. if iteration % self.visual_iters == 0:
  215. self._write_images(iteration)
  216. # override
  217. def on_backward_end(self, iteration, **kwargs):
  218. if iteration == 0:
  219. return
  220. self._update_batches_if_needed()
  221. if iteration % self.stats_iters == 0:
  222. self.gen_stats_updated = False
  223. self.crit_stats_updated = False
  224. if not (self.gen_stats_updated and self.crit_stats_updated):
  225. self._write_model_stats(iteration)
  226. class ImageGenTensorboardWriter(LearnerTensorboardWriter):
  227. def __init__(self, learn: Learner, base_dir: Path, name: str, loss_iters: int = 25, weight_iters: int = 1000,
  228. stats_iters: int = 1000, visual_iters: int = 100):
  229. super().__init__(learn=learn, base_dir=base_dir, name=name, loss_iters=loss_iters, weight_iters=weight_iters,
  230. stats_iters=stats_iters)
  231. self.visual_iters = visual_iters
  232. self.img_gen_vis = ImageGenVisualizer()
  233. def _write_images(self, iteration):
  234. self.img_gen_vis.output_image_gen_visuals(learn=self.learn, trn_batch=self.trn_batch, val_batch=self.val_batch,
  235. iteration=iteration, tbwriter=self.tbwriter)
  236. # override
  237. def on_batch_end(self, metrics, iteration, **kwargs):
  238. super().on_batch_end(metrics=metrics, iteration=iteration, **kwargs)
  239. if iteration == 0:
  240. return
  241. if iteration % self.visual_iters == 0:
  242. self._write_images(iteration)