import csv
from .imports import *
from .torch_imports import *
from .core import *
from .transforms import *
from .layer_optimizer import *
from .dataloader import DataLoader

def get_cv_idxs(n, cv_idx=0, val_pct=0.2, seed=42):
    """ Get a list of index values for the validation set from a dataset

    Arguments:
        n : int, Total number of elements in the data set.
        cv_idx : int, starting index [idx_start = cv_idx*int(val_pct*n)]
        val_pct : float, fraction of the data set to use for validation
        seed : seed value for RandomState

    Returns:
        array of indices
    """
    np.random.seed(seed)
    n_val = int(val_pct*n)
    idx_start = cv_idx*n_val
    idxs = np.random.permutation(n)
    return idxs[idx_start:idx_start+n_val]

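# Example (illustrative, not part of the original module): take 20% of a
# 1000-item dataset as validation indices. With the default seed the result
# is deterministic across runs.
#
#   val_idxs = get_cv_idxs(1000)                         # 200 shuffled indices
#   trn_idxs = np.setdiff1d(np.arange(1000), val_idxs)   # the remaining 800
#
# Passing cv_idx=1 returns the next non-overlapping 20% slice of the same
# permutation, which is how simple cross-validation folds can be built.
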
def resize_img(fname, targ, path, new_path):
    """
    Enlarge or shrink a single image to scale, such that the smaller of the height or width dimension is equal to targ.
    """
    dest = os.path.join(path,new_path,str(targ),fname)
    if os.path.exists(dest): return
    im = Image.open(os.path.join(path, fname)).convert('RGB')
    r,c = im.size
    ratio = targ/min(r,c)
    sz = (scale_to(r, ratio, targ), scale_to(c, ratio, targ))
    os.makedirs(os.path.split(dest)[0], exist_ok=True)
    im.resize(sz, Image.BILINEAR).save(dest)  # BILINEAR: the Image.LINEAR alias was removed in newer Pillow

def resize_imgs(fnames, targ, path, new_path):
    """
    Enlarge or shrink a set of images in the same directory to scale, such that the smaller of the height or width dimension is equal to targ.
    Note:
    -- This function is multithreaded for efficiency.
    -- When the destination file or folder already exists, the function exits without raising an error.
    """
    if not os.path.exists(os.path.join(path,new_path,str(targ),fnames[0])):
        with ThreadPoolExecutor(8) as e:
            ims = e.map(lambda x: resize_img(x, targ, path, new_path), fnames)
            for x in tqdm(ims, total=len(fnames), leave=False): pass
    return os.path.join(path,new_path,str(targ))

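# Example (illustrative): resize every training image so its shorter side is
# 224 pixels, writing the results under <path>/tmp/224/. The folder name
# 'train' and destination 'tmp' are arbitrary choices here.
#
#   fnames = read_dir(path, 'train')                 # paths relative to `path`
#   small_path = resize_imgs(fnames, 224, path, 'tmp')
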
def read_dir(path, folder):
    """ Returns a list of file paths relative to `path` for all files within `folder` """
    full_path = os.path.join(path, folder)
    fnames = glob(f"{full_path}/*.*")
    directories = glob(f"{full_path}/*/")
    if any(fnames):
        return [os.path.relpath(f,path) for f in fnames]
    elif any(directories):
        raise FileNotFoundError("{} has subdirectories but contains no files. Is your directory structure correct?".format(full_path))
    else:
        raise FileNotFoundError("{} folder doesn't exist or is empty".format(full_path))

def read_dirs(path, folder):
    '''
    Fetches the relative paths of all files under `path/folder`, labelling each file with the name of its parent directory.
    '''
    lbls, fnames, all_lbls = [], [], []
    full_path = os.path.join(path, folder)
    for lbl in sorted(os.listdir(full_path)):
        if lbl not in ('.ipynb_checkpoints','.DS_Store'):
            all_lbls.append(lbl)
            for fname in os.listdir(os.path.join(full_path, lbl)):
                if fname != '.DS_Store':  # `not in ('.DS_Store')` was a substring test on a plain string
                    fnames.append(os.path.join(folder, lbl, fname))
                    lbls.append(lbl)
    return fnames, lbls, all_lbls

def n_hot(ids, c):
    '''
    N-hot encoding by index: returns a float array of length `c` that is zero everywhere except at the indices in `ids`, which are set to 1.
    '''
    res = np.zeros((c,), dtype=np.float32)
    res[ids] = 1
    return res

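# Example (illustrative): encode classes 1 and 3 out of 5 categories.
#
#   n_hot([1, 3], 5)    # -> array([0., 1., 0., 1., 0.], dtype=float32)
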
def folder_source(path, folder):
    """
    Returns the filenames and labels for a folder within a path

    Returns:
    -------
    fnames: a list of file paths (relative to `path`) for the images within `folder`
    lbl_arr: a numpy array of label indices into `all_lbls`
    all_lbls: a list of all of the labels in `folder`, one per sub-directory of `folder`
    """
    fnames, lbls, all_lbls = read_dirs(path, folder)
    lbl2idx = {lbl:idx for idx,lbl in enumerate(all_lbls)}
    idxs = [lbl2idx[lbl] for lbl in lbls]
    lbl_arr = np.array(idxs, dtype=int)
    return fnames, lbl_arr, all_lbls

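# Example (illustrative): for a directory layout like
#
#   train/cats/cat1.jpg
#   train/dogs/dog1.jpg
#
# folder_source(path, 'train') returns roughly
#   (['train/cats/cat1.jpg', 'train/dogs/dog1.jpg'], array([0, 1]), ['cats', 'dogs'])
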
def parse_csv_labels(fn, skip_header=True, cat_separator=' '):
    """Parse filenames and label sets from a CSV file.

    This method expects that the csv file at path :fn: has two columns. If it
    has a header, :skip_header: should be set to True. Multiple labels in the
    label column are expected to be separated by :cat_separator:.

    Arguments:
        fn: Path to a CSV file.
        skip_header: A boolean flag indicating whether to skip the header.
        cat_separator: The separator for the categories column.

    Returns:
        a two-tuple of (
            image filenames,
            a dictionary of filenames and corresponding labels
        )
    """
    df = pd.read_csv(fn, index_col=0, header=0 if skip_header else None, dtype=str)
    fnames = df.index.values
    df.iloc[:,0] = df.iloc[:,0].str.split(cat_separator)
    return fnames, df.iloc[:,0].to_dict()  # {fname: [labels]}

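# Example (illustrative): given a CSV like
#
#   id,tags
#   img1,clear primary
#   img2,haze
#
# parse_csv_labels returns
#   (array(['img1', 'img2'], dtype=object),
#    {'img1': ['clear', 'primary'], 'img2': ['haze']})
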
def nhot_labels(label2idx, csv_labels, fnames, c):
    """Convert a {filename: [labels]} dict into a stacked array of n-hot rows, ordered by `fnames`."""
    all_idx = {k: n_hot([label2idx[o] for o in ([] if type(v) == float else v)], c)
               for k,v in csv_labels.items()}  # a float value means the CSV cell was empty (NaN)
    return np.stack([all_idx[o] for o in fnames])

def csv_source(folder, csv_file, skip_header=True, suffix='', continuous=False, cat_separator=' '):
    """Return (full filenames, label array, all labels) parsed from a CSV label file."""
    fnames,csv_labels = parse_csv_labels(csv_file, skip_header, cat_separator)
    return dict_source(folder, fnames, csv_labels, suffix, continuous)

def dict_source(folder, fnames, csv_labels, suffix='', continuous=False):
    """Return (full filenames, label array, all labels) from a {filename: [labels]} dict."""
    all_labels = sorted(list(set(p for o in csv_labels.values() for p in ([] if type(o) == float else o))))
    full_names = [os.path.join(folder,str(fn)+suffix) for fn in fnames]
    if continuous:
        label_arr = np.array([np.array(csv_labels[i]).astype(np.float32)
                              for i in fnames])
    else:
        label2idx = {v:k for k,v in enumerate(all_labels)}
        label_arr = nhot_labels(label2idx, csv_labels, fnames, len(all_labels))
        is_single = np.all(label_arr.sum(axis=1)==1)
        # if every row has exactly one label, collapse n-hot rows to plain class indices
        if is_single: label_arr = np.argmax(label_arr, axis=1)
    return full_names, label_arr, all_labels

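# Example (illustrative): with the CSV above and a hypothetical 'labels.csv',
#
#   fnames, y, classes = csv_source('train', 'labels.csv', suffix='.jpg')
#
# returns full paths such as 'train/img1.jpg', an n-hot label array (or class
# indices when each row has exactly one label), and the sorted label names.
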
class BaseDataset(Dataset):
    """An abstract class representing a fastai dataset. Extends torch.utils.data.Dataset."""
    def __init__(self, transform=None):
        self.transform = transform
        self.n = self.get_n()
        self.c = self.get_c()
        self.sz = self.get_sz()
    def get1item(self, idx):
        x,y = self.get_x(idx),self.get_y(idx)
        return self.get(self.transform, x, y)
    def __getitem__(self, idx):
        if isinstance(idx,slice):
            xs,ys = zip(*[self.get1item(i) for i in range(*idx.indices(self.n))])
            return np.stack(xs),ys
        return self.get1item(idx)
    def __len__(self): return self.n
    def get(self, tfm, x, y):
        return (x,y) if tfm is None else tfm(x,y)
    @abstractmethod
    def get_n(self):
        """Return number of elements in the dataset == len(self)."""
        raise NotImplementedError
    @abstractmethod
    def get_c(self):
        """Return number of classes in a dataset."""
        raise NotImplementedError
    @abstractmethod
    def get_sz(self):
        """Return maximum size of an image in a dataset."""
        raise NotImplementedError
    @abstractmethod
    def get_x(self, i):
        """Return i-th example (image, wav, etc)."""
        raise NotImplementedError
    @abstractmethod
    def get_y(self, i):
        """Return i-th label."""
        raise NotImplementedError
    @property
    def is_multi(self):
        """Returns true if this data set contains multiple labels per sample."""
        return False
    @property
    def is_reg(self):
        """True if the data set is used to train regression models."""
        return False

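# A minimal concrete subclass must implement the five abstract getters above;
# indexing, slicing, and transform handling come from BaseDataset. A
# hypothetical sketch, not part of the library:
#
#   class ListDataset(BaseDataset):
#       def __init__(self, xs, ys, transform=None):
#           self.xs, self.ys = xs, ys
#           super().__init__(transform)
#       def get_n(self): return len(self.xs)
#       def get_c(self): return int(max(self.ys))+1
#       def get_sz(self): return self.transform.sz if self.transform else None
#       def get_x(self, i): return self.xs[i]
#       def get_y(self, i): return self.ys[i]
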
def open_image(fn):
    """ Opens an image using OpenCV given the file path.

    Arguments:
        fn: the file path of the image

    Returns:
        The image in RGB format as numpy array of floats normalized to range between 0.0 - 1.0
    """
    flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR
    if not os.path.exists(fn) and not str(fn).startswith("http"):
        raise OSError('No such file or directory: {}'.format(fn))
    elif os.path.isdir(fn) and not str(fn).startswith("http"):
        raise OSError('Is a directory: {}'.format(fn))
    else:
        #res = np.array(Image.open(fn), dtype=np.float32)/255
        #if len(res.shape)==2: res = np.repeat(res[...,None],3,2)
        #return res
        try:
            if str(fn).startswith("http"):
                req = urllib.request.urlopen(str(fn))  # Python 3: urlopen lives in urllib.request
                image = np.asarray(bytearray(req.read()), dtype="uint8")
                im = cv2.imdecode(image, flags)
            else:
                im = cv2.imread(str(fn), flags)
            # check for None before astype: both imdecode and imread return None on failure,
            # so the original post-astype check could never fire
            if im is None: raise OSError(f'File not recognized by opencv: {fn}')
            im = im.astype(np.float32)/255
            return cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        except Exception as e:
            raise OSError('Error handling image at: {}'.format(fn)) from e

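# Example (illustrative): load an image and check its shape and value range.
# The path below is hypothetical.
#
#   im = open_image('data/train/cats/cat1.jpg')
#   im.shape    # (height, width, 3), RGB channel order
#   im.max()    # <= 1.0, float32
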
class FilesDataset(BaseDataset):
    def __init__(self, fnames, transform, path):
        self.path,self.fnames = path,fnames
        super().__init__(transform)
    def get_sz(self): return self.transform.sz
    def get_x(self, i): return open_image(os.path.join(self.path, self.fnames[i]))
    def get_n(self): return len(self.fnames)
    def resize_imgs(self, targ, new_path):
        # NB: assumes a subclass with a `y` attribute and a (fnames, y, transform, path)
        # constructor, such as FilesArrayDataset below
        dest = resize_imgs(self.fnames, targ, self.path, new_path)
        return self.__class__(self.fnames, self.y, self.transform, dest)
    def denorm(self,arr):
        """Reverse the normalization done to a batch of images.

        Arguments:
            arr: of shape/size (N,3,sz,sz)
        """
        if type(arr) is not np.ndarray: arr = to_np(arr)
        if len(arr.shape)==3: arr = arr[None]
        return self.transform.denorm(np.rollaxis(arr,1,4))

class FilesArrayDataset(FilesDataset):
    def __init__(self, fnames, y, transform, path):
        self.y=y
        assert(len(fnames)==len(y))
        super().__init__(fnames, transform, path)
    def get_y(self, i): return self.y[i]
    def get_c(self):
        return self.y.shape[1] if len(self.y.shape)>1 else 0

class FilesIndexArrayDataset(FilesArrayDataset):
    def get_c(self): return int(self.y.max())+1

class FilesNhotArrayDataset(FilesArrayDataset):
    @property
    def is_multi(self): return True

class FilesIndexArrayRegressionDataset(FilesArrayDataset):
    @property  # must be a property to match BaseDataset.is_reg
    def is_reg(self): return True

class ArraysDataset(BaseDataset):
    def __init__(self, x, y, transform):
        self.x,self.y=x,y
        assert(len(x)==len(y))
        super().__init__(transform)
    def get_x(self, i): return self.x[i]
    def get_y(self, i): return self.y[i]
    def get_n(self): return len(self.y)
    def get_sz(self): return self.x.shape[1]

class ArraysIndexDataset(ArraysDataset):
    def get_c(self): return int(self.y.max())+1
    def get_y(self, i): return self.y[i]

class ArraysIndexRegressionDataset(ArraysIndexDataset):
    @property  # must be a property to match BaseDataset.is_reg
    def is_reg(self): return True

class ArraysNhotDataset(ArraysDataset):
    def get_c(self): return self.y.shape[1]
    @property
    def is_multi(self): return True

class ModelData():
    """Encapsulates DataLoaders and Datasets for training, validation, test. Base class for fastai *Data classes."""
    def __init__(self, path, trn_dl, val_dl, test_dl=None):
        self.path,self.trn_dl,self.val_dl,self.test_dl = path,trn_dl,val_dl,test_dl
    @classmethod
    def from_dls(cls, path,trn_dl,val_dl,test_dl=None):
        #trn_dl,val_dl = DataLoader(trn_dl),DataLoader(val_dl)
        #if test_dl: test_dl = DataLoader(test_dl)
        return cls(path, trn_dl, val_dl, test_dl)
    @property
    def is_reg(self): return self.trn_ds.is_reg
    @property
    def is_multi(self): return self.trn_ds.is_multi
    @property
    def trn_ds(self): return self.trn_dl.dataset
    @property
    def val_ds(self): return self.val_dl.dataset
    @property
    def test_ds(self): return self.test_dl.dataset
    @property
    def trn_y(self): return self.trn_ds.y
    @property
    def val_y(self): return self.val_ds.y

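# Example (illustrative): wrap pre-built DataLoaders in a ModelData so the
# rest of the library can find datasets and labels in the usual places.
#
#   md = ModelData.from_dls('data/', trn_dl, val_dl)
#   md.trn_ds is trn_dl.dataset    # True
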
class ImageData(ModelData):
    def __init__(self, path, datasets, bs, num_workers, classes):
        trn_ds,val_ds,fix_ds,aug_ds,test_ds,test_aug_ds = datasets
        self.path,self.bs,self.num_workers,self.classes = path,bs,num_workers,classes
        self.trn_dl,self.val_dl,self.fix_dl,self.aug_dl,self.test_dl,self.test_aug_dl = [
            self.get_dl(ds,shuf) for ds,shuf in [
                (trn_ds,True),(val_ds,False),(fix_ds,False),(aug_ds,False),
                (test_ds,False),(test_aug_ds,False)
            ]
        ]
    def get_dl(self, ds, shuffle):
        if ds is None: return None
        return DataLoader(ds, batch_size=self.bs, shuffle=shuffle,
            num_workers=self.num_workers, pin_memory=False)
    @property
    def sz(self): return self.trn_ds.sz
    @property
    def c(self): return self.trn_ds.c
    def resized(self, dl, targ, new_path):
        return dl.dataset.resize_imgs(targ,new_path) if dl else None
    def resize(self, targ_sz, new_path='tmp'):
        new_ds = []
        dls = [self.trn_dl,self.val_dl,self.fix_dl,self.aug_dl]
        if self.test_dl: dls += [self.test_dl, self.test_aug_dl]
        else: dls += [None,None]
        t = tqdm_notebook(dls)
        for dl in t: new_ds.append(self.resized(dl, targ_sz, new_path))
        t.close()
        return self.__class__(new_ds[0].path, new_ds, self.bs, self.num_workers, self.classes)
    @staticmethod
    def get_ds(fn, trn, val, tfms, test=None, **kwargs):
        res = [
            fn(trn[0], trn[1], tfms[0], **kwargs), # train
            fn(val[0], val[1], tfms[1], **kwargs), # val
            fn(trn[0], trn[1], tfms[1], **kwargs), # fix
            fn(val[0], val[1], tfms[0], **kwargs)  # aug
        ]
        if test is not None:
            if isinstance(test, tuple):
                test_lbls = test[1]
                test = test[0]
            else:
                if len(trn[1].shape) == 1:
                    test_lbls = np.zeros((len(test),1))
                else:
                    test_lbls = np.zeros((len(test),trn[1].shape[1]))
            res += [
                fn(test, test_lbls, tfms[1], **kwargs), # test
                fn(test, test_lbls, tfms[0], **kwargs)  # test_aug
            ]
        else: res += [None,None]
        return res

class ImageClassifierData(ImageData):
    @classmethod
    def from_arrays(cls, path, trn, val, bs=64, tfms=(None,None), classes=None, num_workers=4, test=None, continuous=False):
        """ Read in images and their labels given as numpy arrays

        Arguments:
            path: a root path of the data (used for storing trained models, precomputed values, etc)
            trn: a tuple of training data matrix and target label/classification array (e.g. `trn=(x,y)` where `x` has the
                shape of `(5000, 784)` and `y` has the shape of `(5000,)`)
            val: a tuple of validation data matrix and target label/classification array.
            bs: batch size
            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`
            classes: a list of all labels/classifications
            num_workers: a number of workers
            test: a matrix of test data (the shape should match `trn[0]`)
            continuous: if True, the targets are treated as continuous values (regression)

        Returns:
            ImageClassifierData
        """
        f = ArraysIndexRegressionDataset if continuous else ArraysIndexDataset
        datasets = cls.get_ds(f, trn, val, tfms, test=test)
        return cls(path, datasets, bs, num_workers, classes=classes)

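    # Example (illustrative): MNIST-style arrays with a model's default
    # transforms. `arch` stands in for a model such as resnet34.
    #
    #   tfms = tfms_from_model(arch, 28)
    #   data = ImageClassifierData.from_arrays('data/mnist/',
    #              trn=(x_trn, y_trn), val=(x_val, y_val), bs=64, tfms=tfms)
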
    @classmethod
    def from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='valid', test_name=None, test_with_labels=False, num_workers=8):
        """ Read in images and their labels given as sub-folder names

        Arguments:
            path: a root path of the data (used for storing trained models, precomputed values, etc)
            bs: batch size
            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`
            trn_name: a name of the folder that contains training images.
            val_name: a name of the folder that contains validation images.
            test_name: a name of the folder that contains test images.
            test_with_labels: whether the test folder is arranged in labelled sub-folders like the training folder.
            num_workers: number of workers

        Returns:
            ImageClassifierData
        """
        assert not(tfms[0] is None or tfms[1] is None), "please provide transformations for your train and validation sets"
        trn,val = [folder_source(path, o) for o in (trn_name, val_name)]
        if test_name:
            test = folder_source(path, test_name) if test_with_labels else read_dir(path, test_name)
        else: test = None
        datasets = cls.get_ds(FilesIndexArrayDataset, trn, val, tfms, path=path, test=test)
        return cls(path, datasets, bs, num_workers, classes=trn[2])

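    # Example (illustrative): dogs-vs-cats style layout with train/ and valid/
    # sub-folders, one directory per class. `arch` is hypothetical.
    #
    #   tfms = tfms_from_model(arch, 224)
    #   data = ImageClassifierData.from_paths('data/dogscats/', bs=64, tfms=tfms)
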
    @classmethod
    def from_csv(cls, path, folder, csv_fname, bs=64, tfms=(None,None),
                 val_idxs=None, suffix='', test_name=None, continuous=False, skip_header=True, num_workers=8, cat_separator=' '):
        """ Read in images and their labels given as a CSV file.

        This method should be used when training image labels are given in a CSV file as opposed to
        sub-directories with label names.

        Arguments:
            path: a root path of the data (used for storing trained models, precomputed values, etc)
            folder: a name of the folder in which training images are contained.
            csv_fname: a name of the CSV file which contains target labels.
            bs: batch size
            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`
            val_idxs: index of images to be used for validation. e.g. output of `get_cv_idxs`.
                If None, default arguments to get_cv_idxs are used.
            suffix: suffix to add to image names in CSV file (sometimes CSV only contains the file name without file
                extension e.g. '.jpg' - in which case, you can set suffix as '.jpg')
            test_name: a name of the folder which contains test images.
            continuous: if True, the label column is treated as continuous target values (regression) rather than categories.
            skip_header: skip the first row of the CSV file.
            num_workers: number of workers
            cat_separator: Labels category separator

        Returns:
            ImageClassifierData
        """
        assert not (tfms[0] is None or tfms[1] is None), "please provide transformations for your train and validation sets"
        assert not (os.path.isabs(folder)), "folder needs to be a relative path"
        fnames,y,classes = csv_source(folder, csv_fname, skip_header, suffix, continuous=continuous, cat_separator=cat_separator)
        return cls.from_names_and_array(path, fnames, y, classes, val_idxs, test_name,
            num_workers=num_workers, suffix=suffix, tfms=tfms, bs=bs, continuous=continuous)

    @classmethod
    def from_path_and_array(cls, path, folder, y, classes=None, val_idxs=None, test_name=None,
                            num_workers=8, tfms=(None,None), bs=64):
        """ Read in images given a sub-folder and their labels given a numpy array

        Arguments:
            path: a root path of the data (used for storing trained models, precomputed values, etc)
            folder: a name of the folder in which training images are contained.
            y: numpy array which contains target labels ordered by filenames.
            classes: a list of all labels/classifications
            bs: batch size
            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`
            val_idxs: index of images to be used for validation. e.g. output of `get_cv_idxs`.
                If None, default arguments to get_cv_idxs are used.
            test_name: a name of the folder which contains test images.
            num_workers: number of workers

        Returns:
            ImageClassifierData
        """
        assert not (tfms[0] is None or tfms[1] is None), "please provide transformations for your train and validation sets"
        assert not (os.path.isabs(folder)), "folder needs to be a relative path"
        fnames = np.core.defchararray.add(f'{folder}/', sorted(os.listdir(os.path.join(path, folder))))  # os.path.join handles a missing trailing slash on `path`
        return cls.from_names_and_array(path, fnames, y, classes, val_idxs, test_name,
            num_workers=num_workers, tfms=tfms, bs=bs)

    @classmethod
    def from_names_and_array(cls, path, fnames, y, classes, val_idxs=None, test_name=None,
                             num_workers=8, suffix='', tfms=(None,None), bs=64, continuous=False):
        val_idxs = get_cv_idxs(len(fnames)) if val_idxs is None else val_idxs
        ((val_fnames,trn_fnames),(val_y,trn_y)) = split_by_idx(val_idxs, np.array(fnames), y)
        test_fnames = read_dir(path, test_name) if test_name else None
        if continuous: f = FilesIndexArrayRegressionDataset
        else:
            f = FilesIndexArrayDataset if len(trn_y.shape)==1 else FilesNhotArrayDataset
        datasets = cls.get_ds(f, (trn_fnames,trn_y), (val_fnames,val_y), tfms,
                              path=path, test=test_fnames)
        return cls(path, datasets, bs, num_workers, classes=classes)

def split_by_idx(idxs, *a):
    """
    Split each array passed as *a into a pair of arrays: (elements selected by idxs, the remaining elements).
    This can be used to split multiple arrays containing training data into validation and training sets.

    :param idxs [int]: list of selected indexes
    :param a: list of np.arrays, each of which should have the same number of elements in the first dimension
    :return: list of tuples, each containing a split of the corresponding array from *a.
        The first element of each tuple is an array composed of the elements selected by idxs;
        the second element is an array of the remaining elements.
    """
    mask = np.zeros(len(a[0]),dtype=bool)
    mask[np.array(idxs)] = True
    return [(o[mask],o[~mask]) for o in a]
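
# Example (illustrative): carve a validation split out of parallel arrays.
#
#   val_idxs = get_cv_idxs(len(fnames))
#   (val_f, trn_f), (val_y, trn_y) = split_by_idx(val_idxs, np.array(fnames), y)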