Skip to content

anomaly

AnomalyDataset(transform, samples, task='segmentation', valid_area_mask=None, crop_area=None)

Bases: Dataset

Anomaly Dataset.

Parameters:

  • transform (Compose) –

    Albumentations compose.

  • task (str, default: 'segmentation' ) –

    classification or segmentation

  • samples (DataFrame) –

    Pandas dataframe containing samples following the same structure created by make_anomaly_dataset

  • valid_area_mask (Optional[str], default: None ) –

    Optional path to the mask to use to filter out the valid area of the image. If None, the whole image is considered valid.

  • crop_area (Optional[Tuple[int, int, int, int]], default: None ) –

    Optional tuple of 4 integers (x1, y1, x2, y2) to crop the image to the specified area. If None, the whole image is considered valid.

Source code in quadra/datasets/anomaly.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def __init__(
    self,
    transform: alb.Compose,
    samples: DataFrame,
    task: str = "segmentation",
    valid_area_mask: Optional[str] = None,
    crop_area: Optional[Tuple[int, int, int, int]] = None,
) -> None:
    """Store the dataset configuration and optionally load the valid area mask.

    Args:
        transform: Albumentations compose applied to every loaded sample.
        samples: Dataframe describing the samples. Its index is reset so rows are numbered from 0.
        task: Either ``classification`` or ``segmentation``.
        valid_area_mask: Optional path to a mask image. Pixels greater than 0 are treated as valid.
            If None, no mask is stored.
        crop_area: Optional tuple (x1, y1, x2, y2) stored for cropping loaded images.

    Raises:
        RuntimeError: If ``valid_area_mask`` is given but does not exist or cannot be read as an image.
    """
    self.task = task
    self.transform = transform

    # Drop the incoming index so rows are addressed 0..len-1 regardless of how the frame was filtered upstream.
    self.samples = samples.reset_index(drop=True)
    # The split of the whole dataset is taken from the first distinct value of the ``split`` column.
    self.split = self.samples.split.unique()[0]

    self.crop_area = crop_area
    self.valid_area_mask: Optional[np.ndarray] = None

    if valid_area_mask is not None:
        if not os.path.exists(valid_area_mask):
            raise RuntimeError(f"Valid area mask {valid_area_mask} does not exist.")

        # cv2.imread signals an undecodable file by returning None instead of raising, which would otherwise
        # surface as an unrelated TypeError in the comparison below.
        mask_image = cv2.imread(valid_area_mask, 0)
        if mask_image is None:
            raise RuntimeError(f"Valid area mask {valid_area_mask} could not be read as an image.")

        # Boolean mask: True where the grayscale mask is non-zero.
        self.valid_area_mask = mask_image > 0

__getitem__(index)

Get the dataset item at the given index.

Parameters:

  • index (int) –

    Index to get the item.

Returns:

Source code in quadra/datasets/anomaly.py
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
    """Get dataset item for the index ``index``.

    Args:
        index: Positional index of the sample to load.

    Returns:
        For the ``train`` split, a dict with the transformed ``image`` and its ``label``.
        For the ``val`` and ``test`` splits, a dict with ``image_path``, ``label`` and the transformed ``image``;
        when the task is ``segmentation`` it also contains ``mask_path`` and the transformed ``mask``.
        For any other split value an empty dict is returned.

    Raises:
        RuntimeError: If the image file cannot be read.
    """
    item: Dict[str, Union[str, Tensor]] = {}

    image_path = self.samples.samples.iloc[index]
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file is missing or undecodable.
    if image is None:
        raise RuntimeError(f"Image {image_path} could not be read.")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Remember the uncropped shape: generated masks must match the full image before cropping.
    original_image_shape = image.shape
    if self.valid_area_mask is not None:
        # Zero out every pixel outside the valid area (mask is broadcast over the channel axis).
        image = image * self.valid_area_mask[:, :, np.newaxis]

    if self.crop_area is not None:
        image = image[self.crop_area[1] : self.crop_area[3], self.crop_area[0] : self.crop_area[2]]

    # Use positional lookup like for the image path, so that every column is read from the same row
    # (label-based ``[index]`` disagrees with ``.iloc`` for negative or non-contiguous indices).
    label_index = self.samples.label_index.iloc[index]

    if self.split == "train":
        pre_processed = self.transform(image=image)
        item = {"image": pre_processed["image"], "label": label_index}
    elif self.split in ["val", "test"]:
        item["image_path"] = image_path
        item["label"] = label_index

        if self.task == "segmentation":
            mask_path = self.samples.mask_path.iloc[index]

            # If good images have no associated mask create an empty one
            if label_index == 0:
                mask = np.zeros(shape=original_image_shape[:2])
            else:
                if os.path.isfile(mask_path):
                    # Masks are read as grayscale and rescaled from [0, 255] to [0, 1].
                    mask = cv2.imread(mask_path, flags=0) / 255.0
                else:
                    # We need ones in the mask to compute correctly at least image level f1 score
                    mask = np.ones(shape=original_image_shape[:2])

            # Apply the same valid-area filtering and cropping to the mask as to the image.
            if self.valid_area_mask is not None:
                mask = mask * self.valid_area_mask

            if self.crop_area is not None:
                mask = mask[self.crop_area[1] : self.crop_area[3], self.crop_area[0] : self.crop_area[2]]

            pre_processed = self.transform(image=image, mask=mask)

            item["mask_path"] = mask_path
            item["mask"] = pre_processed["mask"]
        else:
            pre_processed = self.transform(image=image)

        item["image"] = pre_processed["image"]
    return item

__len__()

Get length of the dataset.

Source code in quadra/datasets/anomaly.py
224
225
226
def __len__(self) -> int:
    """Return the number of entries held in ``self.samples``."""
    sample_count = len(self.samples)
    return sample_count

create_validation_set_from_test_set(samples, seed=0)

Create Validation Set from Test Set.

This function creates a validation set from test set by splitting both normal and abnormal samples to two.

Parameters:

  • samples (DataFrame) –

    Dataframe containing dataset info such as filenames, splits etc.

  • seed (int, default: 0 ) –

    Random seed to ensure reproducibility. Defaults to 0.

Source code in quadra/datasets/anomaly.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def create_validation_set_from_test_set(samples: DataFrame, seed: int = 0) -> DataFrame:
    """Create a validation set out of the test set.

    Half (rounded down) of the normal test samples and half (rounded down) of
    the abnormal test samples are randomly re-labelled as ``val``. The input
    dataframe is modified in place and returned.

    Args:
        samples: Dataframe containing dataset info such as filenames, splits etc.
        seed: Random seed; the global random generator is seeded only when it is greater than 0.
    """
    if seed > 0:
        random.seed(seed)

    # Normal samples are handled first, then abnormal ones, so the random draws happen in a fixed order.
    for pick_normal in (True, False):
        in_test = samples.split == "test"
        in_group = (samples.targets == "good") if pick_normal else (samples.targets != "good")

        candidates = samples.index[in_test & in_group].to_list()
        half = len(candidates) // 2

        moved_to_val = random.sample(population=candidates, k=half)
        samples.loc[moved_to_val, "split"] = "val"

    return samples

make_anomaly_dataset(path, split=None, split_ratio=0.1, seed=0, mask_suffix=None, create_test_set_if_empty=True)

Create dataframe by parsing a folder following the MVTec data file structure.

The files are expected to follow the structure

path/to/dataset/split/label/image_filename.xyz path/to/dataset/ground_truth/label/mask_filename.png

Masks MUST be png images; no other format is allowed. Split can be either train/val/test.

This function creates a dataframe to store the parsed information based on the following format:

|   | path          | split | targets | samples      | mask_path                                     | label_index |
|---|---------------|-------|---------|--------------|-----------------------------------------------|-------------|
| 0 | datasets/name | test  | defect  | filename.xyz | ground_truth/defect/filename{mask_suffix}.png | 1           |

Parameters:

  • path (Path) –

    Path to dataset

  • split (Optional[str], default: None ) –

    Dataset split to return (train, val or test). If None, all splits are returned. Defaults to None.

  • split_ratio (float, default: 0.1 ) –

    Ratio to split normal training images and add to the test set in case test set doesn't contain any normal images. Defaults to 0.1.

  • seed (int, default: 0 ) –

    Random seed to ensure reproducibility when splitting. Defaults to 0.

  • mask_suffix (Optional[str], default: None ) –

    String to append to the base filename to get the mask name. For example, in the MVTec dataset masks are saved as imagename_mask.png; in this case the parameter should be set to "_mask".

  • create_test_set_if_empty (bool, default: True ) –

    If True, create a test set if the test set is empty.

Example

The following example shows how to get training samples from MVTec bottle category:

root = Path('./MVTec') category = 'bottle' path = root / category path PosixPath('MVTec/bottle')

samples = make_anomaly_dataset(path, split='train', split_ratio=0.1, seed=0) samples.head() path split label image_path mask_path label_index 0 MVTec/bottle train good MVTec/bottle/train/good/105.png MVTec/bottle/ground_truth/good/105_mask.png 0 1 MVTec/bottle train good MVTec/bottle/train/good/017.png MVTec/bottle/ground_truth/good/017_mask.png 0 2 MVTec/bottle train good MVTec/bottle/train/good/137.png MVTec/bottle/ground_truth/good/137_mask.png 0 3 MVTec/bottle train good MVTec/bottle/train/good/152.png MVTec/bottle/ground_truth/good/152_mask.png 0 4 MVTec/bottle train good MVTec/bottle/train/good/109.png MVTec/bottle/ground_truth/good/109_mask.png 0

Returns:

  • DataFrame

    An output dataframe containing samples for the requested split (ie., train or test)

Source code in quadra/datasets/anomaly.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def make_anomaly_dataset(
    path: Path,
    split: Optional[str] = None,
    split_ratio: float = 0.1,
    seed: int = 0,
    mask_suffix: Optional[str] = None,
    create_test_set_if_empty: bool = True,
) -> DataFrame:
    """Create dataframe by parsing a folder following the MVTec data file structure.

    The files are expected to follow the structure:
        path/to/dataset/split/label/image_filename.xyz
        path/to/dataset/ground_truth/label/mask_filename.png

    Masks MUST be png images, no other format is allowed.
    Split can be either train/val/test.

    This function creates a dataframe to store the parsed information based on the following format:
    |---|---------------|-------|---------|--------------|-----------------------------------------------|-------------|
    |   | path          | split | targets | samples      | mask_path                                     | label_index |
    |---|---------------|-------|---------|--------------|-----------------------------------------------|-------------|
    | 0 | datasets/name |  test |  defect | filename.xyz | ground_truth/defect/filename{mask_suffix}.png | 1           |
    |---|---------------|-------|---------|--------------|-----------------------------------------------|-------------|

    In the returned dataframe ``samples`` and ``mask_path`` are prefixed with ``path`` (they are absolute only
    if ``path`` is), and ``mask_path`` is an empty string for good images of the test split.

    Args:
        path: Path to dataset
        split: Dataset split to return, one of train, val or test. Any other value, including None,
            returns the samples of all splits. Defaults to None.
        split_ratio: Ratio to split normal training images and add to the
            test set in case test set doesn't contain any normal images.
            Defaults to 0.1.
        seed: Random seed forwarded to the train/test splitting. Defaults to 0.
        mask_suffix: String to append to the base filename to get the mask name. For example, when masks
            are saved as imagename_mask.png the parameter should be set to "_mask".
        create_test_set_if_empty: If True, move part of the normal training images to the test set when the
            test set contains no normal images.

    Example:
        The following example shows how to get training samples from MVTec bottle category:

        >>> root = Path('./MVTec')
        >>> category = 'bottle'
        >>> path = root / category
        >>> path
        PosixPath('MVTec/bottle')

        >>> samples = make_anomaly_dataset(path, split='train', split_ratio=0.1, seed=0, mask_suffix='_mask')
        >>> list(samples.columns)
        ['path', 'split', 'targets', 'samples', 'mask_path', 'label_index']

    Returns:
        An output dataframe containing samples for the requested split, or for all splits if no valid
        split is requested.

    Raises:
        RuntimeError: If no image is found under ``path``.
    """
    # Each entry is (dataset path, split, label, filename), taken from the last three path components.
    samples_list = [
        (str(path),) + filename.parts[-3:]
        for filename in path.glob("**/*")
        if filename.is_file()
        and os.path.splitext(filename)[-1].lower() in IMAGE_EXTENSIONS
        and ".ipynb_checkpoints" not in str(filename)
    ]

    if len(samples_list) == 0:
        raise RuntimeError(f"Found 0 images in {path}")

    samples_list.sort()

    data = pd.DataFrame(samples_list, columns=["path", "split", "targets", "samples"])
    # Masks live under ground_truth and are not samples. Take an explicit copy so that the column
    # assignments below operate on an independent frame instead of a filtered slice (avoids pandas
    # chained-assignment warnings).
    data = data[data.split != "ground_truth"].copy()

    # Create mask_path column, masks MUST have png extension
    data["mask_path"] = (
        data.path
        + "/ground_truth/"
        + data.targets
        + "/"
        + data.samples.apply(lambda x: os.path.splitext(os.path.basename(x))[0])
        + (f"{mask_suffix}.png" if mask_suffix is not None else ".png")
    )

    # Replace the bare filename in the samples column with the full path below the dataset folder
    data["samples"] = data.path + "/" + data.split + "/" + data.targets + "/" + data.samples

    # Split the normal images in training set if test set doesn't
    # contain any normal images. This is needed because AUC score
    # cannot be computed based on 1-class
    has_normal_test_images = ((data.split == "test") & (data.targets == "good")).any()
    if create_test_set_if_empty and not has_normal_test_images:
        data = split_normal_images_in_train_set(data, split_ratio, seed)

    # Good images don't have mask
    data.loc[(data.split == "test") & (data.targets == "good"), "mask_path"] = ""

    # Create label index for normal (0), anomalous (1) and unknown (-1) images.
    data.loc[data.targets == "good", "label_index"] = 0
    data.loc[~data.targets.isin(["good", "unknown"]), "label_index"] = 1
    data.loc[data.targets == "unknown", "label_index"] = -1
    data.label_index = data.label_index.astype(int)

    # Get the data frame for the split.
    if split is not None and split in ["train", "val", "test"]:
        data = data[data.split == split]
        data = data.reset_index(drop=True)

    return data

split_normal_images_in_train_set(samples, split_ratio=0.1, seed=0)

Split normal images in train set.

This function splits the normal images in training set and assigns the
values to the test set. This is particularly useful especially when the
test set does not contain any normal images.

This is important because when the test set doesn't have any normal images,
AUC computation fails due to having single class.

Parameters:

  • samples (DataFrame) –

    Dataframe containing dataset info such as filenames, splits etc.

  • split_ratio (float, default: 0.1 ) –

    Train-Test normal image split ratio. Defaults to 0.1.

  • seed (int, default: 0 ) –

    Random seed to ensure reproducibility. Defaults to 0.

Returns:

  • DataFrame

    Output dataframe where the part of the training set is assigned to test set.

Source code in quadra/datasets/anomaly.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def split_normal_images_in_train_set(samples: DataFrame, split_ratio: float = 0.1, seed: int = 0) -> DataFrame:
    """Move a random share of the normal training images to the test set.

    This is useful when the test set does not contain any normal images: with
    a single class in the test set, AUC computation fails.

    The input dataframe is modified in place and returned.

    Args:
        samples: Dataframe containing dataset info such as filenames, splits etc.
        split_ratio: Fraction of the normal training images to reassign to the test set. Defaults to 0.1.
        seed: Random seed; the global random generator is seeded only when it is greater than 0.

    Returns:
        The same dataframe, with part of the normal training rows now assigned to the test split.
    """
    if seed > 0:
        random.seed(seed)

    is_normal_train = (samples.split == "train") & (samples.targets == "good")
    candidates = samples.index[is_normal_train].to_list()

    # The number of rows to move is truncated towards zero.
    n_to_move = int(len(candidates) * split_ratio)

    moved_to_test = random.sample(population=candidates, k=n_to_move)
    samples.loc[moved_to_test, "split"] = "test"

    return samples