# sgdr.py

from .imports import *
from .layer_optimizer import *
from enum import IntEnum
from timeit import default_timer as timer
import copy


class Callback:
    '''
    An abstract class that all callback classes (e.g., LossRecorder) extend from.
    Must be extended before usage.
    '''
    def on_train_begin(self): pass
    def on_batch_begin(self): pass
    def on_phase_begin(self): pass
    def on_epoch_end(self, metrics): pass
    def on_phase_end(self): pass
    def on_batch_end(self, metrics): pass
    def on_train_end(self): pass
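
# Illustrative sketch (not part of the library): a minimal Callback subclass.
# It counts batches and, like LR_Finder/OptimScheduler below, signals the
# training loop to stop early by returning True from on_batch_end.
class _ExampleStopAfterNBatches(Callback):
    def __init__(self, max_batches):
        super().__init__()
        self.max_batches = max_batches
    def on_train_begin(self):
        self.batch = 0
    def on_batch_end(self, metrics):
        self.batch += 1
        return self.batch >= self.max_batches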

# Useful for maintaining status of a long-running job.
#
# Usage:
# learn.fit(0.01, 1, callbacks = [LoggingCallback(save_path="/tmp/log")])
class LoggingCallback(Callback):
    '''
    A class useful for maintaining status of a long-running job.
    e.g.: learn.fit(0.01, 1, callbacks = [LoggingCallback(save_path="/tmp/log")])
    '''
    def __init__(self, save_path):
        super().__init__()
        self.save_path = save_path
    def on_train_begin(self):
        self.batch = 0
        self.epoch = 0
        self.phase = 0
        self.f = open(self.save_path, "a", 1)
        self.log("\ton_train_begin")
    def on_batch_begin(self):
        self.log(str(self.batch)+"\ton_batch_begin")
    def on_phase_begin(self):
        self.log(str(self.phase)+"\ton_phase_begin")
    def on_epoch_end(self, metrics):
        self.log(str(self.epoch)+"\ton_epoch_end: "+str(metrics))
        self.epoch += 1
    def on_phase_end(self):
        self.log(str(self.phase)+"\ton_phase_end")
        self.phase += 1
    def on_batch_end(self, metrics):
        self.log(str(self.batch)+"\ton_batch_end: "+str(metrics))
        self.batch += 1
    def on_train_end(self):
        self.log("\ton_train_end")
        self.f.close()
    def log(self, string):
        self.f.write(time.strftime("%Y-%m-%dT%H:%M:%S")+"\t"+string+"\n")

class LossRecorder(Callback):
    '''
    Saves and displays loss values and other metrics.
    This is the default scheduler used when none is specified in a learner.
    '''
    def __init__(self, layer_opt, save_path='', record_mom=False, metrics=[]):
        super().__init__()
        self.layer_opt = layer_opt
        self.init_lrs = np.array(layer_opt.lrs)
        self.save_path, self.record_mom, self.metrics = save_path, record_mom, metrics
    def on_train_begin(self):
        self.losses,self.lrs,self.iterations,self.epochs,self.times = [],[],[],[],[]
        self.start_at = timer()
        self.val_losses, self.rec_metrics = [], []
        if self.record_mom:
            self.momentums = []
        self.iteration = 0
        self.epoch = 0
    def on_epoch_end(self, metrics):
        self.epoch += 1
        self.epochs.append(self.iteration)
        self.times.append(timer() - self.start_at)
        self.save_metrics(metrics)
    def on_batch_end(self, loss):
        self.iteration += 1
        self.lrs.append(self.layer_opt.lr)
        self.iterations.append(self.iteration)
        if isinstance(loss, list):
            self.losses.append(loss[0])
            self.save_metrics(loss[1:])
        else: self.losses.append(loss)
        if self.record_mom: self.momentums.append(self.layer_opt.mom)
    def save_metrics(self, vals):
        self.val_losses.append(delistify(vals[0]))
        if len(vals) > 2: self.rec_metrics.append(vals[1:])
        elif len(vals) == 2: self.rec_metrics.append(vals[1])
    def plot_loss(self, n_skip=10, n_skip_end=5):
        '''
        Plots the loss as a function of iterations.
        When used in a Jupyter notebook, the plot is displayed inline; otherwise both the
        plot and the losses are saved to save_path.
        '''
        if not in_ipynb(): plt.switch_backend('agg')
        plt.plot(self.iterations[n_skip:-n_skip_end], self.losses[n_skip:-n_skip_end])
        if not in_ipynb():
            plt.savefig(os.path.join(self.save_path, 'loss_plot.png'))
            np.save(os.path.join(self.save_path, 'losses.npy'), self.losses[10:])
    def plot_lr(self):
        '''Plots the learning rate in a Jupyter notebook, or saves it to a file, depending on the environment of the learner.'''
        if not in_ipynb():
            plt.switch_backend('agg')
        if self.record_mom:
            fig, axs = plt.subplots(1,2,figsize=(12,4))
            for i in range(0,2): axs[i].set_xlabel('iterations')
            axs[0].set_ylabel('learning rate')
            axs[1].set_ylabel('momentum')
            axs[0].plot(self.iterations,self.lrs)
            axs[1].plot(self.iterations,self.momentums)
        else:
            plt.xlabel("iterations")
            plt.ylabel("learning rate")
            plt.plot(self.iterations, self.lrs)
        if not in_ipynb():
            plt.savefig(os.path.join(self.save_path, 'lr_plot.png'))
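
# Illustrative sketch (not part of the library): drive LossRecorder by hand with a
# minimal stand-in for LayerOptimizer to show what gets recorded on each batch.
class _FakeLayerOpt:
    '''Hypothetical stub exposing just the attributes LossRecorder reads.'''
    def __init__(self, lr=0.01, mom=0.9):
        self.lrs, self.lr, self.mom = [lr], lr, mom

def _demo_loss_recorder():
    rec = LossRecorder(_FakeLayerOpt(), record_mom=True)
    rec.on_train_begin()
    for loss in [1.0, 0.8, 0.6]:
        rec.on_batch_end(loss)   # appends to losses, lrs, iterations, momentums
    return rec.iterations, rec.losses, rec.lrs, rec.momentums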

class LR_Updater(LossRecorder):
    '''
    Abstract class from which all learning rate updaters inherit (e.g., CircularLR).
    Calculates and updates the new learning rate and momentum at the end of each batch.
    Must be extended before use.
    '''
    def on_train_begin(self):
        super().on_train_begin()
        self.update_lr()
        if self.record_mom:
            self.update_mom()
    def on_batch_end(self, loss):
        res = super().on_batch_end(loss)
        self.update_lr()
        if self.record_mom:
            self.update_mom()
        return res
    def update_lr(self):
        new_lrs = self.calc_lr(self.init_lrs)
        self.layer_opt.set_lrs(new_lrs)
    def update_mom(self):
        new_mom = self.calc_mom()
        self.layer_opt.set_mom(new_mom)
    @abstractmethod
    def calc_lr(self, init_lrs): raise NotImplementedError
    @abstractmethod
    def calc_mom(self): raise NotImplementedError

class LR_Finder(LR_Updater):
    '''
    Helps you find an optimal learning rate for a model, following the suggestion of the 2015 CLR paper.
    The learning rate is increased on a linear or log scale, depending on user input, and the resulting
    losses are retained so they can be plotted later.
    '''
    def __init__(self, layer_opt, nb, end_lr=10, linear=False, metrics=[]):
        self.linear, self.stop_dv = linear, True
        ratio = end_lr/layer_opt.lr
        self.lr_mult = (ratio/nb) if linear else ratio**(1/nb)
        super().__init__(layer_opt, metrics=metrics)
    def on_train_begin(self):
        super().on_train_begin()
        self.best = 1e9
    def calc_lr(self, init_lrs):
        mult = self.lr_mult*self.iteration if self.linear else self.lr_mult**self.iteration
        return init_lrs * mult
    def on_batch_end(self, metrics):
        loss = metrics[0] if isinstance(metrics,list) else metrics
        if self.stop_dv and (math.isnan(loss) or loss>self.best*4):
            return True
        if (loss<self.best and self.iteration>10): self.best = loss
        return super().on_batch_end(metrics)
    def plot(self, n_skip=10, n_skip_end=5):
        '''
        Plots the loss as a function of the learning rate, on a log scale.
        '''
        plt.ylabel("validation loss")
        plt.xlabel("learning rate (log scale)")
        plt.plot(self.lrs[n_skip:-(n_skip_end+1)], self.losses[n_skip:-(n_skip_end+1)])
        plt.xscale('log')
        plt.savefig(os.path.join(self.save_path, 'lr_loss_plot.png'))
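
# Illustrative sketch (not part of the library): the exponential sweep LR_Finder applies
# when linear=False, i.e. lr_i = start_lr * (end_lr/start_lr) ** (i/nb), which reaches
# end_lr at i == nb.
def _demo_lr_finder_sweep(start_lr=1e-5, end_lr=10, nb=100):
    lr_mult = (end_lr / start_lr) ** (1 / nb)
    return [start_lr * lr_mult ** i for i in range(nb + 1)]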

class LR_Finder2(LR_Finder):
    """
    A variant of lr_find() that helps find the best learning rate. It doesn't run
    a full epoch but a fixed number of iterations (which may be more or fewer than
    an epoch, depending on your data).
    """
    def __init__(self, layer_opt, nb, end_lr=10, linear=False, metrics=[], stop_dv=True):
        self.nb, self.metrics = nb, metrics
        super().__init__(layer_opt, nb, end_lr, linear, metrics)
        self.stop_dv = stop_dv
    def on_batch_end(self, loss):
        if self.iteration == self.nb:
            return True
        return super().on_batch_end(loss)
    def plot(self, n_skip=10, n_skip_end=5, smoothed=True):
        if self.metrics is None: self.metrics = []
        n_plots = len(self.metrics)+2
        fig, axs = plt.subplots(n_plots, figsize=(6,4*n_plots))
        for i in range(0, n_plots): axs[i].set_xlabel('learning rate')
        axs[0].set_ylabel('training loss')
        axs[1].set_ylabel('validation loss')
        for i,m in enumerate(self.metrics):
            axs[i+2].set_ylabel(m.__name__)
            if len(self.metrics) == 1:
                values = self.rec_metrics
            else:
                values = [rec[i] for rec in self.rec_metrics]
            if smoothed: values = smooth_curve(values, 0.98)
            axs[i+2].plot(self.lrs[n_skip:-n_skip_end], values[n_skip:-n_skip_end])
        plt_val_l = smooth_curve(self.val_losses, 0.98) if smoothed else self.val_losses
        axs[0].plot(self.lrs[n_skip:-n_skip_end], self.losses[n_skip:-n_skip_end])
        axs[1].plot(self.lrs[n_skip:-n_skip_end], plt_val_l[n_skip:-n_skip_end])

class CosAnneal(LR_Updater):
    ''' Learning rate scheduler that implements a cosine annealing schedule. '''
    def __init__(self, layer_opt, nb, on_cycle_end=None, cycle_mult=1):
        self.nb,self.on_cycle_end,self.cycle_mult = nb,on_cycle_end,cycle_mult
        super().__init__(layer_opt)
    def on_train_begin(self):
        self.cycle_iter,self.cycle_count = 0,0
        super().on_train_begin()
    def calc_lr(self, init_lrs):
        if self.iteration<self.nb/20:
            self.cycle_iter += 1
            return init_lrs/100.
        cos_out = np.cos(np.pi*(self.cycle_iter)/self.nb) + 1
        self.cycle_iter += 1
        if self.cycle_iter==self.nb:
            self.cycle_iter = 0
            self.nb *= self.cycle_mult
            if self.on_cycle_end: self.on_cycle_end(self, self.cycle_count)
            self.cycle_count += 1
        return init_lrs / 2 * cos_out
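
# Illustrative sketch (not part of the library): the curve CosAnneal.calc_lr follows over
# one cycle of nb iterations (ignoring the initial nb/20 warm-up at init_lr/100):
# lr_i = init_lr / 2 * (cos(pi * i / nb) + 1).
def _demo_cos_anneal(init_lr=0.01, nb=100):
    return [init_lr / 2 * (np.cos(np.pi * i / nb) + 1) for i in range(nb)]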

class CircularLR(LR_Updater):
    '''
    A learning rate updater that implements the CircularLearningRate (CLR) scheme.
    Learning rate is increased then decreased linearly.
    '''
    def __init__(self, layer_opt, nb, div=4, cut_div=8, on_cycle_end=None, momentums=None):
        self.nb,self.div,self.cut_div,self.on_cycle_end = nb,div,cut_div,on_cycle_end
        if momentums is not None:
            self.moms = momentums
        super().__init__(layer_opt, record_mom=(momentums is not None))
    def on_train_begin(self):
        self.cycle_iter,self.cycle_count = 0,0
        super().on_train_begin()
    def calc_lr(self, init_lrs):
        cut_pt = self.nb//self.cut_div
        if self.cycle_iter>cut_pt:
            pct = 1 - (self.cycle_iter - cut_pt)/(self.nb - cut_pt)
        else: pct = self.cycle_iter/cut_pt
        res = init_lrs * (1 + pct*(self.div-1)) / self.div
        self.cycle_iter += 1
        if self.cycle_iter==self.nb:
            self.cycle_iter = 0
            if self.on_cycle_end: self.on_cycle_end(self, self.cycle_count)
            self.cycle_count += 1
        return res
    def calc_mom(self):
        cut_pt = self.nb//self.cut_div
        if self.cycle_iter>cut_pt:
            pct = (self.cycle_iter - cut_pt)/(self.nb - cut_pt)
        else: pct = 1 - self.cycle_iter/cut_pt
        res = self.moms[1] + pct * (self.moms[0] - self.moms[1])
        return res
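
# Illustrative sketch (not part of the library): the triangular shape CircularLR.calc_lr
# produces, rising from init_lr/div to init_lr over nb//cut_div steps, then falling back.
def _demo_circular_lr(init_lr=0.01, nb=100, div=4, cut_div=8):
    cut_pt = nb // cut_div
    lrs = []
    for i in range(nb):
        pct = 1 - (i - cut_pt) / (nb - cut_pt) if i > cut_pt else i / cut_pt
        lrs.append(init_lr * (1 + pct * (div - 1)) / div)
    return lrs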

class CircularLR_beta(LR_Updater):
    def __init__(self, layer_opt, nb, div=10, pct=10, on_cycle_end=None, momentums=None):
        self.nb,self.div,self.pct,self.on_cycle_end = nb,div,pct,on_cycle_end
        self.cycle_nb = int(nb * (1-pct/100) / 2)
        if momentums is not None:
            self.moms = momentums
        super().__init__(layer_opt, record_mom=(momentums is not None))
    def on_train_begin(self):
        self.cycle_iter,self.cycle_count = 0,0
        super().on_train_begin()
    def calc_lr(self, init_lrs):
        if self.cycle_iter>2*self.cycle_nb:
            pct = (self.cycle_iter - 2*self.cycle_nb)/(self.nb - 2*self.cycle_nb)
            res = init_lrs * (1 + (pct * (1-100)/100)) / self.div
        elif self.cycle_iter>self.cycle_nb:
            pct = 1 - (self.cycle_iter - self.cycle_nb)/self.cycle_nb
            res = init_lrs * (1 + pct*(self.div-1)) / self.div
        else:
            pct = self.cycle_iter/self.cycle_nb
            res = init_lrs * (1 + pct*(self.div-1)) / self.div
        self.cycle_iter += 1
        if self.cycle_iter==self.nb:
            self.cycle_iter = 0
            if self.on_cycle_end: self.on_cycle_end(self, self.cycle_count)
            self.cycle_count += 1
        return res
    def calc_mom(self):
        if self.cycle_iter>2*self.cycle_nb:
            res = self.moms[0]
        elif self.cycle_iter>self.cycle_nb:
            pct = 1 - (self.cycle_iter - self.cycle_nb)/self.cycle_nb
            res = self.moms[0] + pct * (self.moms[1] - self.moms[0])
        else:
            pct = self.cycle_iter/self.cycle_nb
            res = self.moms[0] + pct * (self.moms[1] - self.moms[0])
        return res

class SaveBestModel(LossRecorder):
    """ Saves the weights of the best model found during training.
    If metrics are provided, the first metric in the list is used to
    find the best model.
    If no metrics are provided, the loss is used.
    Args:
        model: the fastai model
        layer_opt: the LayerOptimizer of the learner
        metrics: the metrics being tracked (may be None)
        name: the filename of the weights, without '.h5'
    Usage:
        Briefly, you have your model in the 'learn' variable and call fit.
        >>> learn.fit(lr, 2, cycle_len=2, cycle_mult=1, best_save_name='mybestmodel')
        ....
        >>> learn.load('mybestmodel')
        For more details see http://forums.fast.ai/t/a-code-snippet-to-save-the-best-model-during-training/12066
    """
    def __init__(self, model, layer_opt, metrics, name='best_model'):
        super().__init__(layer_opt)
        self.name = name
        self.model = model
        self.best_loss = None
        self.best_acc = None
        self.save_method = self.save_when_only_loss if metrics is None else self.save_when_acc
    def save_when_only_loss(self, metrics):
        loss = metrics[0]
        if self.best_loss is None or loss < self.best_loss:
            self.best_loss = loss
            self.model.save(f'{self.name}')
    def save_when_acc(self, metrics):
        loss, acc = metrics[0], metrics[1]
        if self.best_acc is None or acc > self.best_acc:
            self.best_acc = acc
            self.best_loss = loss
            self.model.save(f'{self.name}')
        elif acc == self.best_acc and loss < self.best_loss:
            self.best_loss = loss
            self.model.save(f'{self.name}')
    def on_epoch_end(self, metrics):
        super().on_epoch_end(metrics)
        self.save_method(metrics)

class WeightDecaySchedule(Callback):
    def __init__(self, layer_opt, batch_per_epoch, cycle_len, cycle_mult, n_cycles, norm_wds=False, wds_sched_mult=None):
        """
        Implements the weight decay schedule described in https://arxiv.org/abs/1711.05101
        :param layer_opt: The LayerOptimizer
        :param batch_per_epoch: Number of batches in one epoch
        :param cycle_len: Number of epochs in the initial cycle. Subsequent cycle_len = previous cycle_len * cycle_mult
        :param cycle_mult: Cycle multiplier
        :param n_cycles: Number of cycles to be executed
        :param norm_wds: If True, normalize the weight decay by the square root of the number of batches in the current cycle
        :param wds_sched_mult: Optional function of this callback returning the weight decay multiplier (the 'eta' in the paper)
        """
        super().__init__()
        self.layer_opt = layer_opt
        self.batch_per_epoch = batch_per_epoch
        self.init_wds = np.array(layer_opt.wds)  # Weight decays as set by the user
        self.init_lrs = np.array(layer_opt.lrs)  # Learning rates as set by the user
        self.new_wds = None                      # Holds the new weight decay factors, calculated in on_batch_begin()
        self.iteration = 0
        self.epoch = 0
        self.wds_sched_mult = wds_sched_mult
        self.norm_wds = norm_wds
        self.wds_history = list()
        # Pre-calculate, for each epoch, the number of epochs in the cycle it belongs to
        self.epoch_to_num_cycles, i = dict(), 0
        for cycle in range(n_cycles):
            for _ in range(cycle_len):
                self.epoch_to_num_cycles[i] = cycle_len
                i += 1
            cycle_len *= cycle_mult
    def on_train_begin(self):
        self.iteration = 0
        self.epoch = 0
    def on_batch_begin(self):
        # Prepare the weight decay for this batch.
        # Default weight decay (as provided by the user)
        wdn = self.init_wds
        # Weight decay multiplier (the 'eta' in the paper). Optional.
        wdm = 1.0
        if self.wds_sched_mult is not None:
            wdm = self.wds_sched_mult(self)
        # Normalized weight decay. Optional.
        if self.norm_wds:
            wdn = wdn / np.sqrt(self.batch_per_epoch * self.epoch_to_num_cycles[self.epoch])
        # Final wds
        self.new_wds = wdm * wdn
        # Use the decoupled form so the weight decay is not applied inside the optimizer's (e.g. Adam's) update
        self.layer_opt.set_wds_out(self.new_wds)
        self.iteration += 1
    def on_epoch_end(self, metrics):
        self.epoch += 1
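
# Illustrative sketch (not part of the library): a hypothetical wds_sched_mult function.
# It receives the WeightDecaySchedule instance and returns the eta multiplier; here it
# decays eta linearly from 1 to 0 over the whole run.
def _example_wds_sched_mult(cb):
    total_batches = cb.batch_per_epoch * len(cb.epoch_to_num_cycles)
    return 1.0 - cb.iteration / total_batches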

class DecayType(IntEnum):
    ''' Data class; each decay type is assigned a number. '''
    NO = 1
    LINEAR = 2
    COSINE = 3
    EXPONENTIAL = 4
    POLYNOMIAL = 5

class DecayScheduler():
    '''Given a start and an end value, generates the next value depending on the decay type
    and the number of iterations (by calling next_val()).'''
    def __init__(self, dec_type, num_it, start_val, end_val=None, extra=None):
        self.dec_type, self.nb, self.start_val, self.end_val, self.extra = dec_type, num_it, start_val, end_val, extra
        self.it = 0
        if self.end_val is None and self.dec_type not in (DecayType.NO, DecayType.EXPONENTIAL): self.end_val = 0
    def next_val(self):
        self.it += 1
        if self.dec_type == DecayType.NO:
            return self.start_val
        elif self.dec_type == DecayType.LINEAR:
            pct = self.it/self.nb
            return self.start_val + pct * (self.end_val-self.start_val)
        elif self.dec_type == DecayType.COSINE:
            cos_out = np.cos(np.pi*(self.it)/self.nb) + 1
            return self.end_val + (self.start_val-self.end_val) / 2 * cos_out
        elif self.dec_type == DecayType.EXPONENTIAL:
            ratio = self.end_val / self.start_val
            return self.start_val * (ratio ** (self.it/self.nb))
        elif self.dec_type == DecayType.POLYNOMIAL:
            return self.end_val + (self.start_val-self.end_val) * (1 - self.it/self.nb)**self.extra
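
# Illustrative sketch (not part of the library): a cosine decay from 0.01 to 0.001
# over 10 iterations, produced by calling next_val() once per iteration.
def _demo_decay_scheduler():
    sched = DecayScheduler(DecayType.COSINE, 10, 0.01, 0.001)
    return [sched.next_val() for _ in range(10)]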

class TrainingPhase():
    '''
    Object holding the training information for one phase, when multiple phases are involved during training.
    Used in fit_opt_sched in learner.py
    '''
    def __init__(self, epochs=1, opt_fn=optim.SGD, lr=1e-2, lr_decay=DecayType.NO, momentum=0.9,
                 momentum_decay=DecayType.NO, beta=None, wds=None, wd_loss=True):
        """
        Creates an object containing all the relevant information for one part of a model's training.
        Args:
            epochs: number of epochs to train like this
            opt_fn: an optimizer (e.g. optim.Adam)
            lr: one learning rate or a tuple of the form (start_lr, end_lr);
                each of those can be a list/numpy array for differential learning rates
            lr_decay: a DecayType specifying how the learning rate should change
            momentum: one momentum (or beta1 in the case of Adam), or a tuple of the form (start_mom, end_mom)
            momentum_decay: a DecayType specifying how the momentum should change
            beta: beta2 parameter of Adam or alpha parameter of RMSProp
            wds: weight decay (can be an array for differential wds)
        """
        self.epochs, self.opt_fn, self.lr, self.momentum, self.beta, self.wds = epochs, opt_fn, lr, momentum, beta, wds
        if isinstance(lr_decay, tuple): self.lr_decay, self.extra_lr = lr_decay
        else: self.lr_decay, self.extra_lr = lr_decay, None
        if isinstance(momentum_decay, tuple): self.mom_decay, self.extra_mom = momentum_decay
        else: self.mom_decay, self.extra_mom = momentum_decay, None
        self.wd_loss = wd_loss
    def phase_begin(self, layer_opt, nb_batches):
        self.layer_opt = layer_opt
        if isinstance(self.lr, tuple): start_lr, end_lr = self.lr
        else: start_lr, end_lr = self.lr, None
        self.lr_sched = DecayScheduler(self.lr_decay, nb_batches * self.epochs, start_lr, end_lr, extra=self.extra_lr)
        if isinstance(self.momentum, tuple): start_mom, end_mom = self.momentum
        else: start_mom, end_mom = self.momentum, None
        self.mom_sched = DecayScheduler(self.mom_decay, nb_batches * self.epochs, start_mom, end_mom, extra=self.extra_mom)
        self.layer_opt.set_opt_fn(self.opt_fn)
        self.layer_opt.set_lrs(start_lr)
        self.layer_opt.set_mom(start_mom)
        if self.beta is not None: self.layer_opt.set_beta(self.beta)
        if self.wds is not None:
            if not isinstance(self.wds, Iterable): self.wds = [self.wds]
            if len(self.wds) == 1: self.wds = self.wds * len(self.layer_opt.layer_groups)
            if self.wd_loss: self.layer_opt.set_wds(self.wds)
            else: self.layer_opt.set_wds_out(self.wds)
    def update(self):
        new_lr, new_mom = self.lr_sched.next_val(), self.mom_sched.next_val()
        self.layer_opt.set_lrs(new_lr)
        self.layer_opt.set_mom(new_mom)
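
# Illustrative sketch (not part of the library): a pair of TrainingPhase objects of the
# kind passed to fit_opt_sched in learner.py; the specific values are made up.
def _example_phases():
    return [
        TrainingPhase(epochs=1, opt_fn=optim.SGD, lr=(1e-3, 1e-2), lr_decay=DecayType.LINEAR,
                      momentum=(0.95, 0.85), momentum_decay=DecayType.LINEAR),
        TrainingPhase(epochs=1, opt_fn=optim.SGD, lr=(1e-2, 1e-4), lr_decay=DecayType.COSINE,
                      momentum=(0.85, 0.95), momentum_decay=DecayType.LINEAR),
    ]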

class OptimScheduler(LossRecorder):
    '''Learning rate scheduler for training involving multiple phases.'''
    def __init__(self, layer_opt, phases, nb_batches, stop_div=False):
        self.phases, self.nb_batches, self.stop_div = phases, nb_batches, stop_div
        super().__init__(layer_opt, record_mom=True)
    def on_train_begin(self):
        super().on_train_begin()
        self.phase,self.best = 0,1e9
    def on_batch_end(self, metrics):
        loss = metrics[0] if isinstance(metrics,list) else metrics
        if self.stop_div and (math.isnan(loss) or loss>self.best*4):
            return True
        if (loss<self.best and self.iteration>10): self.best = loss
        super().on_batch_end(metrics)
        self.phases[self.phase].update()
    def on_phase_begin(self):
        self.phases[self.phase].phase_begin(self.layer_opt, self.nb_batches[self.phase])
    def on_phase_end(self):
        self.phase += 1
    def plot_lr(self, show_text=True, show_moms=True):
        """Plots the learning rate/momentum schedule."""
        phase_limits = [0]
        for nb_batch, phase in zip(self.nb_batches, self.phases):
            phase_limits.append(phase_limits[-1] + nb_batch * phase.epochs)
        if not in_ipynb():
            plt.switch_backend('agg')
        np_plts = 2 if show_moms else 1
        fig, axs = plt.subplots(1,np_plts,figsize=(6*np_plts,4))
        if not show_moms: axs = [axs]
        for i in range(np_plts): axs[i].set_xlabel('iterations')
        axs[0].set_ylabel('learning rate')
        axs[0].plot(self.iterations,self.lrs)
        if show_moms:
            axs[1].set_ylabel('momentum')
            axs[1].plot(self.iterations,self.momentums)
        if show_text:
            for i, phase in enumerate(self.phases):
                text = phase.opt_fn.__name__
                if phase.wds is not None: text+='\nwds='+str(phase.wds)
                if phase.beta is not None: text+='\nbeta='+str(phase.beta)
                for k in range(np_plts):
                    if i < len(self.phases)-1:
                        draw_line(axs[k], phase_limits[i+1])
                    draw_text(axs[k], (phase_limits[i]+phase_limits[i+1])/2, text)
        if not in_ipynb():
            plt.savefig(os.path.join(self.save_path, 'lr_plot.png'))
    def plot(self, n_skip=10, n_skip_end=5, linear=None):
        if linear is None: linear = self.phases[-1].lr_decay == DecayType.LINEAR
        plt.ylabel("loss")
        plt.plot(self.lrs[n_skip:-n_skip_end], self.losses[n_skip:-n_skip_end])
        if linear: plt.xlabel("learning rate")
        else:
            plt.xlabel("learning rate (log scale)")
            plt.xscale('log')

def draw_line(ax, x):
    xmin, xmax, ymin, ymax = ax.axis()
    ax.plot([x,x],[ymin,ymax], color='red', linestyle='dashed')

def draw_text(ax, x, text):
    xmin, xmax, ymin, ymax = ax.axis()
    ax.text(x,(ymin+ymax)/2,text, horizontalalignment='center', verticalalignment='center', fontsize=14, alpha=0.5)

def smooth_curve(vals, beta):
    avg_val = 0
    smoothed = []
    for (i,v) in enumerate(vals):
        avg_val = beta * avg_val + (1-beta) * v
        smoothed.append(avg_val/(1-beta**(i+1)))
    return smoothed
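
# Illustrative sketch (not part of the library): smooth_curve is a bias-corrected
# exponential moving average, so a constant series comes back (almost) unchanged.
def _demo_smooth_curve():
    assert all(abs(v - 1.0) < 1e-6 for v in smooth_curve([1.0] * 5, 0.98))
    return smooth_curve([1.0, 0.0, 1.0, 0.0], 0.9)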