Commit d2e4165b authored by Roberto Ugolotti's avatar Roberto Ugolotti
Browse files

Add notebook examples

parent 04ab862d
# Contents
This folder contains the notebooks used in the training course "Deep Learning on the JRC Data Platform". It contains two examples: the first
uses Keras to build a Convolutional Neural Network from scratch to classify satellite images; the second uses PyTorch to fine-tune
a pre-trained network for fake news detection.
\ No newline at end of file
%% Cell type:markdown id:8dbbbfff-6e38-415a-9cb3-08ae3a63d6de tags:
# Classify EuroSAT images using a simple network written in Keras
%% Cell type:markdown id:5618969b-62b3-463a-b89b-c2f792dc4d0a tags:
## SatImNet
SatImNet (https://www.mdpi.com/2072-4292/12/20/3358/htm) is a collection of open training data, structured and harmonized according to specific rules provided by BDAP. In this example we are going to use this library (still in development) to retrieve EuroSAT (https://github.com/phelber/eurosat) data in a format readily accessible through Keras and create a CNN-based classifier.
## Keras reader
Keras provides several utilities for feeding data into its networks. An example is https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image_dataset_from_directory, which reads images from disk prepared in a simple structure. The DataGenerator class in the SatImNet library reads a batch of images from a list and loads only those into memory, limiting resource usage. A simple cache mechanism further reduces disk accesses.
%% Cell type:code id:cd06012e-2276-48a2-a0e0-b08f0b482cf8 tags:
``` python
from satimnet.dataset_utils import get_images_and_labels
from satimnet.keras_readers import DataGenerator
from sklearn.model_selection import train_test_split
import numpy as np
```
%% Cell type:code id:423ef7a1-55ef-4c6a-b26f-1d53ad8e3070 tags:
``` python
# Get names of images and labels from SatImNet
images, labels = get_images_and_labels('eurosat', 'rgb')
# Reduce amount of data (it will take too long for this demo)
images = images[::5]
labels = labels[::5]
```
%% Cell type:code id:bdee8ce5-43fb-4497-9011-43a75bc2de4a tags:
``` python
# Peek at the first few (image, label) pairs.
# Slice BOTH lists: the original sliced only `images`, relying on zip()
# truncating at the shorter iterable — same result, but misleading to read.
for image, label in zip(images[:5], labels[:5]):
    print(image, '|', label)
```
%% Cell type:code id:bcb6ec6f-8e2c-4fc6-988b-8a129093c78e tags:
``` python
# Split into train and validation (it would be better to have a test too)
images_train, images_val, labels_train, labels_val = train_test_split(images, labels, test_size=0.33)
```
%% Cell type:code id:3d4c291a-3f11-4170-9428-2045105658fc tags:
``` python
# Target image size (height, width) — assumes the EuroSAT RGB tiles match; TODO confirm
image_size = (64, 64)
n_channels = 3  # RGB
num_classes = 10
# Index in this list corresponds to the integer label / one-hot position
label_names = ["AnnualCrop", "Forest", "HerbaceousVegetation", "Highway", "Industrial",
               "Pasture", "PermanentCrop", "Residential", "River", "SeaLake"]
# Object that will read training data in random order (better for training)
dg_train = DataGenerator(images_train, labels_train, dim=image_size, n_channels=n_channels,
                         batch_size=32, n_classes=num_classes, shuffle=True)
# Object that will read validation data in sequential order (does not matter for validation)
dg_val = DataGenerator(images_val, labels_val, dim=image_size, n_channels=n_channels,
                       batch_size=32, n_classes=num_classes, shuffle=False)
```
%% Cell type:code id:d4332797-4ad3-4627-8382-025de3f9eca5 tags:
``` python
# Get a batch of data
gen_images, gen_labels = dg_train[0]
print('A batch contains %d images' % len(gen_labels))
```
%% Cell type:code id:401c76ba-e4da-4c21-b324-d0684de4521f tags:
``` python
import pylab as plt
_, axes = plt.subplots(3, 3, figsize=(10, 7))
axes = axes.ravel()
for img_id in range(9):
axes[img_id].imshow(gen_images[img_id] / 255)
axes[img_id].xaxis.set_visible(False)
axes[img_id].yaxis.set_visible(False)
axes[img_id].set_title(label_names[gen_labels[img_id].argmax()])
```
%% Cell type:code id:ca4ace2e-4e56-40d6-82fe-14dd89809c0b tags:
``` python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
```
%% Cell type:code id:979c0a8c-28d3-4fb9-8036-372724bec604 tags:
``` python
# Check that Tensorflow will use GPU
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
assert 'GPU' in str(device_lib.list_local_devices())
```
%% Cell type:code id:ce4de5fa-205c-436e-8ee8-74f6713dded9 tags:
``` python
def make_model(input_shape, num_classes):
    """Build a small CNN classifier: Conv/BatchNorm/ReLU/MaxPool stacks,
    then global average pooling, dropout and a softmax output layer.

    Args:
        input_shape (tuple): (height, width, channels) of the input images.
        num_classes (int): number of output classes (softmax units).

    Returns:
        keras.Model: the uncompiled classification model.
    """
    # Input layer: number of neurons is the size of the input (preserving the shape)
    inputs = keras.Input(shape=input_shape)
    # Entry block: bring pixel values from [0, 255] into [0, 1]
    x = layers.experimental.preprocessing.Rescaling(1.0 / 255)(inputs)
    # A bunch of convolutional layers (channels grow as spatial size shrinks)
    x = layers.Conv2D(32, 3, strides=2, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.MaxPooling2D((3, 3), strides=2, padding="same")(x)
    x = layers.Conv2D(64, 3, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.MaxPooling2D((3, 3), strides=2, padding="same")(x)
    x = layers.Conv2D(128, 3, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.MaxPooling2D((3, 3), strides=2, padding="same")(x)
    x = layers.Conv2D(1024, 3, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    # Collapse the remaining spatial dimensions into one vector per image
    x = layers.GlobalAveragePooling2D()(x)
    # Dropout to reduce overfitting
    x = layers.Dropout(0.5)(x)
    # Final layer, classification output a neuron for each class
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    return keras.Model(inputs, outputs)

model = make_model(input_shape=(image_size[0], image_size[1], n_channels), num_classes=num_classes)
```
%% Cell type:code id:31e8f683-2b61-4003-9974-0b5d3f0a9724 tags:
``` python
model.summary()
```
%% Cell type:code id:8efe85d0-f46f-43fd-826f-373fdc64fbe8 tags:
``` python
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=1e-5), # Optimizer: how to update the weights
loss="categorical_crossentropy", # Loss: what the optimizer will optimize
metrics=["categorical_accuracy"] # Metric: what I will use to evaluate the model
)
```
%% Cell type:code id:a490b520-90e4-40d2-a1e0-ed0c999cfd99 tags:
``` python
# At each epoch, the entire training set is presented to the network
epochs = 10
# Every epoch, save the best model to disk: only checkpoints that improve
# validation accuracy are kept (save_best_only=True)
save_model_cb = keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_categorical_accuracy',
                                                save_best_only=True, verbose=1, save_freq='epoch')
model.fit(
    dg_train, epochs=epochs, validation_data=dg_val, verbose=1,
    callbacks=[save_model_cb]
)
```
%% Cell type:code id:dfc8f949-3516-479b-a28c-57951354bc02 tags:
``` python
# How to load the best model
best_model = keras.models.load_model('best_model.h5')
```
%% Cell type:code id:57f058a4-0df4-408e-9f11-71313ab4b86f tags:
``` python
# Let's test the network
predictions = best_model.predict(dg_val)
np.set_printoptions(suppress=True)
print('This is how a single response of the network looks like:\n', predictions[0])
print('So, the class predicted by the network is %d -> %s' % (predictions[0].argmax(), label_names[predictions[0].argmax()]))
print('All predictions for all inputs are:', predictions.argmax(axis=1))
```
%% Cell type:code id:4c23f0ae-07b4-4a71-b4d5-8ce8716d1d46 tags:
``` python
true_labels = np.concatenate([x[1] for x in dg_val]).argmax(axis=1)
pred_labels = predictions.argmax(axis=1)
```
%% Cell type:code id:fcaf92f4-6e48-4df3-9dc1-e1813a5b19e3 tags:
``` python
from sklearn.metrics import confusion_matrix
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns
# are predictions. The original call swapped the arguments, which transposes
# the matrix and mislabels the axes of the heatmap plotted below.
cm = confusion_matrix(true_labels, pred_labels)
```
%% Cell type:code id:e6037851-3b01-48b2-b4db-827545f730a2 tags:
``` python
import seaborn as sns
sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_names, yticklabels=label_names)
```
%% Cell type:markdown id:c00cd2f9-8400-4cef-aedc-a4a59146bf47 tags:
## We have learned
* Read SatImNet data
* Split a dataset between train and validation set using Scikit-learn
* Create a convolutional neural network with Keras
* Train a network
* Evaluate the results
""" Package to read Machine Library dataset in EOS. """
\ No newline at end of file
import json
import os
from glob import glob
from zipfile import ZipFile
from tqdm import tqdm
from .readers import MAINFOLDER
# Catalogue of the SatImNet datasets: maps each lower-case dataset key to its
# root folder on the platform and, for EuroSAT, to the class-name -> integer
# label mapping used by the readers.
DATASETS = {
    'airbus-ship':
        {
            'path': os.path.join(MAINFOLDER, 'Airbus-ship')
        },
    'bigearthnet':
        {
            'path': os.path.join(MAINFOLDER, 'BigEarthNet-v1.0')
        },
    'clouds-s2-taiwan':
        {
            'path': os.path.join(MAINFOLDER, 'clouds-s2-taiwan')
        },
    'dota':
        {
            'path': os.path.join(MAINFOLDER, 'DOTA')
        },
    'eurosat':
        {
            'path': os.path.join(MAINFOLDER, 'EuroSAT'),
            # Class names match the per-class zip file names; the ids are the
            # integer labels returned by _get_ial_eurosat.
            'classes': {
                "AnnualCrop": 0,
                "Forest": 1,
                "HerbaceousVegetation": 2,
                "Highway": 3,
                "Industrial": 4,
                "Pasture": 5,
                "PermanentCrop": 6,
                "Residential": 7,
                "River": 8,
                "SeaLake": 9
            }
        },
    'inria_aerial_image_labeling':
        {
            'path': os.path.join(MAINFOLDER, 'Inria_Aerial_Image_Labeling')
        },
    'xview':
        {
            'path': os.path.join(MAINFOLDER, 'xView')
        },
}
# Prefix that lets GDAL read files directly inside zip archives (/vsizip/)
ZIP_PREFIX = '/vsizip/'
def get_images_and_labels(dataset, *args, **kwargs):
    """Return two lists with image locations and labels for a SatImNet dataset
    (labels is None if not available).

    Args:
        dataset (str): case-insensitive dataset key; one of the keys of ``DATASETS``.
        *args, **kwargs: forwarded to the dataset-specific reader.

    Returns:
        tuple: (images, labels); ``labels`` is ``None`` when the dataset has no
        labels available (e.g. some test splits).

    Raises:
        RuntimeError: if ``dataset`` is not a known dataset key.
        NotImplementedError: for datasets that are not accessible yet.
    """
    dataset = dataset.lower()
    # Membership test directly on the dict (no need for .keys())
    if dataset not in DATASETS:
        raise RuntimeError('Invalid dataset', dataset)
    if dataset == 'airbus-ship':
        return _get_ial_airbus(*args, **kwargs)
    if dataset == 'bigearthnet':
        raise NotImplementedError('Big Earth Net dataset still not accessible')
        # Intentionally unreachable until the dataset becomes accessible:
        return _get_ial_bigearthnet(*args, **kwargs)
    if dataset == 'clouds-s2-taiwan':
        return _get_ial_clouds(*args, **kwargs)
    if dataset == 'dota':
        return _get_ial_dota(*args, **kwargs)
    if dataset == 'eurosat':
        return _get_ial_eurosat(*args, **kwargs)
    if dataset == 'inria_aerial_image_labeling':
        return _get_ial_inria_aerial_image_labeling(*args, **kwargs)
    if dataset == 'xview':
        raise NotImplementedError('xView dataset still not accessible')
        # Intentionally unreachable until the dataset becomes accessible:
        return _get_ial_xview(*args, **kwargs)
def __airbus_image_path(main_path, set_name, fname):
    """Build the /vsizip/ path of one Airbus-ship image; images are grouped
    into zip archives named after the first character of the file name."""
    archive = '%s.zip' % fname[0]
    return os.path.join(ZIP_PREFIX + main_path, set_name, 'images', archive, fname + '.jpg')
def __airbus_label_path(main_path, fname):
    """Build the /vsizip/ path of one Airbus-ship PNG label mask (train set only)."""
    member = fname + '.png'
    return os.path.join(ZIP_PREFIX + main_path, 'train', 'labels_png.zip', member)
def _get_ial_airbus(set_name, only_training_images_with_labels=False):
    """ Retrieve Airbus-ship data

    Args:
        set_name (str): train or test
        only_training_images_with_labels (bool): used only when `set_name` is `train`.
            If True, returns only images with associated labels. If False, returns all images with a None
            instead of label
    Returns:
        two lists with images and labels location if `set_name` is `train`
        list with images and None if `set_name` is `test`
    """
    assert set_name in ('train', 'test')
    main_path = DATASETS['airbus-ship']['path']
    if set_name == 'train':
        fzip = os.path.join(main_path, set_name, 'labels_png.zip')
        fnames = ZipFile(fzip).namelist()
        label_names = [f[:-4] for f in fnames]  # strip the '.png' extension
        if only_training_images_with_labels:
            # Labelled subset: one image per label file, in label-archive order.
            # (label_paths is built here only — the original computed it before
            # the branch and then discarded/rebuilt it in the else-branch.)
            label_paths = [os.path.join(ZIP_PREFIX + fzip, fname) for fname in fnames]
            image_paths = [__airbus_image_path(main_path, set_name, n) for n in label_names]
        else:
            # All images: pair each with its label when one exists, None otherwise.
            zipfiles = glob(os.path.join(main_path, set_name, 'images', '*.zip'))
            images_list = list()
            image_paths = list()
            for fzip in zipfiles:
                fnames = ZipFile(fzip).namelist()
                images_list.extend([f[:-4] for f in fnames])
                image_paths.extend(os.path.join(ZIP_PREFIX + fzip, fname) for fname in fnames)
            label_names = set(label_names)  # O(1) membership tests below
            label_paths = [__airbus_label_path(main_path, im_name) if im_name in label_names else None
                           for im_name in images_list]
    else:
        # Test split ships without labels
        label_paths = None
        zipfiles = glob(os.path.join(main_path, set_name, 'images', '*.zip'))
        image_paths = list()
        for fzip in zipfiles:
            fnames = ZipFile(fzip).namelist()
            image_paths.extend(os.path.join(ZIP_PREFIX + fzip, fname) for fname in fnames)
    return image_paths, label_paths
def _get_ial_bigearthnet(band, verbose=False):
    """Return BigEarthNet patch image paths for one band and, per patch, the
    list of integer class ids parsed from its labels-metadata JSON.

    TODO: this has to be done using the json files otherwise it is too slow

    Args:
        band (str): Sentinel-2 band name (e.g. 'B02', 'B8A').
        verbose (bool): if True, show a tqdm progress bar while scanning.

    Returns:
        two lists: /vsizip/ image paths and lists of class ids.
    """
    classes = {
        "Agro-forestry areas": 1,
        "Annual crops associated with permanent crops": 2,
        "Bare rock": 3,
        "Beaches dunes sands": 4,
        "Broad-leaved forest": 5,
        "Burnt areas": 6,
        "Coastal lagoons": 7,
        "Complex cultivation patterns": 8,
        "Coniferous forest": 9,
        "Construction sites": 10,
        "Continuous urban fabric": 11,
        "Discontinuous urban fabric": 12,
        "Dump sites": 13,
        "Estuaries": 14,
        "Fruit trees and berry plantations": 15,
        "Green urban areas": 16,
        "Industrial or commercial units": 17,
        "Inland marshes": 18,
        "Intertidal flats": 19,
        "Land principally occupied by agriculture, with significant areas of natural vegetation": 20,
        "Mineral extraction sites": 21,
        "Mixed forest": 22,
        "Moors and heathland": 23,
        "Natural grassland": 24,
        "Non-irrigated arable land": 25,
        "Olive groves": 26,
        "Pastures": 27,
        "Peatbogs": 28,
        "Permanently irrigated land": 29,
        "Port areas": 30,
        "Road and rail networks and associated land": 31,
        "Salines": 32,
        "Salt marshes": 33,
        "Sclerophyllous vegetation": 34,
        "Sea and ocean": 35,
        "Sparsely vegetated areas": 36,
        "Sport and leisure facilities": 37,
        "Transitional woodland/shrub": 38,
        "Vineyards": 39,
        "Water bodies": 40,
        "Water courses": 41,
        "Airports": 42,
        "Beaches, dunes, sands": 43,
        "Rice fields": 44,
    }
    assert band in ('B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09', 'B8A', 'B11', 'B12')
    main_path = DATASETS['bigearthnet']['path']
    # Sort for a deterministic, reproducible output order: glob() order is
    # filesystem-dependent and set iteration order is hash-dependent.
    zipfiles = sorted(glob(os.path.join(main_path, '**', '**', '*.zip')))
    image_paths = list()
    labels = list()
    for fzip in tqdm(zipfiles, disable=not verbose):
        z = ZipFile(fzip)
        # Each patch lives in its own folder inside the archive
        folders = {os.path.dirname(f) for f in z.namelist()}
        for folder in sorted(folders):
            # Image of the requested band for this patch
            image_paths.append(os.path.join(ZIP_PREFIX + fzip, folder, '%s_%s.tif' % (folder, band)))
            # Class ids come from the patch's JSON metadata
            json_name = os.path.join(folder, '%s_labels_metadata.json' % folder)
            labels.append([classes[l] for l in json.loads(z.open(json_name).read())['labels']])
    return image_paths, labels
def _get_ial_clouds():
    """Return matching /vsizip/ image and label paths for the clouds-s2-taiwan train set."""
    main_path = DATASETS['clouds-s2-taiwan']['path']

    def _members(zipname):
        # Expose every archive member through GDAL's /vsizip/ driver
        return [os.path.join(ZIP_PREFIX + zipname, f) for f in ZipFile(zipname).namelist()]

    images = _members(os.path.join(main_path, 'train', 'images.zip'))
    labels = _members(os.path.join(main_path, 'train', 'labels.zip'))
    return images, labels
def _get_ial_dota(set_name, label_version=None, label_type=None):
    """Return DOTA image paths and label paths.

    Args:
        set_name (str): 'train', 'val' or 'test'.
        label_version (str): 'v1.0' or 'v1.5'; required unless set_name is 'test'.
        label_type (str): 'pixel' or 'bounding-box'; required unless set_name is 'test'.

    Returns:
        two lists (images, labels); labels is None for the test split.
    """
    def __remove_image_from_list(name, images):
        # Remove (in place) the first image path containing `name`;
        # used for images that ship without label files.
        for im in images:
            if name in im:
                images.remove(im)
                break
    main_path = DATASETS['dota']['path']
    assert set_name in ('train', 'val', 'test')
    if set_name != 'test':
        assert label_version in ('v1.0', 'v1.5'), 'Invalid label_version: %s' % label_version
        assert label_type in ('pixel', 'bounding-box'), 'Invalid label_type: %s' % label_type
        # Pick the label archive matching the requested version and type;
        # the archive layout differs between v1.0 and v1.5.
        if label_version == 'v1.0':
            if label_type == 'pixel':
                labels_path = os.path.join('labelTxt-v1.0', 'labelTxt.zip')
            else:
                if set_name == 'train':
                    labels_path = os.path.join('labelTxt-v1.0', 'Train_Task2_gt.zip')
                else:
                    labels_path = os.path.join('labelTxt-v1.0', 'valset_reclabelTxt.zip')
        else:
            if label_type == 'pixel':
                labels_path = os.path.join('labelTxt-v1.5', 'DOTA-v1.5_%s.zip' % set_name)
            else:
                labels_path = os.path.join('labelTxt-v1.5', 'DOTA-v1.5_%s_hbb.zip' % set_name)
    images = glob(os.path.join(main_path, set_name, 'images', '*.png'))
    if set_name == 'test':
        # Test split has no ground truth
        labels = None
    else:
        labels_path = os.path.join(main_path, set_name, labels_path)
        labels = [os.path.join(ZIP_PREFIX + labels_path, f) for f in ZipFile(labels_path).namelist()]
        # There are two images that do not have the labels, I am going to remove them manually
        if set_name == 'train':
            __remove_image_from_list('P1531', images)
            if label_version == 'v1.0':
                __remove_image_from_list('P2123', images)
        elif set_name == 'val':
            __remove_image_from_list('P2152', images)
            __remove_image_from_list('P2330', images)
    return images, labels
def _get_ial_eurosat(bands):
    """Return EuroSAT /vsizip/ image paths and their integer class labels.

    Args:
        bands (str): 'rgb' or 'all_bands' variant of the dataset.
    """
    assert bands in ('rgb', 'all_bands')
    main_path = DATASETS['eurosat']['path']
    classes = DATASETS['eurosat']['classes']
    labels = list()
    images = list()
    # One archive per class: the class name is the zip file name minus '.zip'
    for zipfile in glob(os.path.join(main_path, bands, 'images', '*.zip')):
        class_id = classes[os.path.basename(zipfile)[:-4]]
        for member in ZipFile(zipfile).namelist():
            images.append(os.path.join(ZIP_PREFIX + zipfile, member))
            labels.append(class_id)
    return images, labels
def _get_ial_inria_aerial_image_labeling(set_name):
    """Return Inria Aerial Image Labeling image paths and, for 'train', label masks."""
    assert set_name in ('train', 'test')
    main_path = DATASETS['inria_aerial_image_labeling']['path']

    def _tifs(subfolder):
        return glob(os.path.join(main_path, set_name, subfolder, '*.tif'))

    images = _tifs('images')
    # The test split ships without ground truth
    labels = _tifs('labels') if set_name == 'train' else None
    return images, labels
def _get_ial_xview(*args, **kwargs):
    # Placeholder: xView is not accessible yet — the dispatcher raises
    # NotImplementedError before this function can be reached.
    return None
import os
import json
try:
    # math.prod is available from Python 3.8 onwards
    from math import prod
except ImportError:
    from functools import reduce
    from operator import mul

    def prod(iterable):
        """Fallback for math.prod: multiply all items together (1 for empty input)."""
        return reduce(mul, iterable, 1)
import numpy as np
from cachetools import cached, RRCache
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.ops import image_ops
from . import readers
class DataGenerator(keras.utils.Sequence):
'Generates data for Keras'
def __init__(self, images, labels, batch_size=32, dim=None, n_channels=None, dtype=np.float32,
n_classes=10, shuffle=True, resize=False, cache_size_mb=128, normalize_masks=False,
labels_policy_if_none='none'):
"""
Args:
images (list[str]): list with files containing input images
labels (list[str], list[int], None): list with files containing labels/masks OR list of integers with
labels OR None
batch_size (int): batch size
dim (tuple): size of a single input image. If resize is True, all images will be resized to this size
n_channels (int): number of channels of input images
dtype (type): data type of input images
n_classes (int): number of classes for labels
shuffle (bool): if True, present dataset in different order every time
resize (bool): if True, resize an image when its dimension is different than expected. If False, raises
an error
cache_size_mb (int): data is loaded from disk as little as possible. Set this number to use a cache
normalize_masks (bool): if labels are images in to be used as masks (e.g. in segmentation tasks) if this
flag is True, it expects input data to be [0, 255] to be normalized to [0, 1]
labels_policy_if_none (bool): if labels is not provided (None) decide what to do. If 'none', returns None
every time; if 'int': returns a list with 0 as long as batch_size; if 'image': returns a mask with
all 0s of the same dimension of the images
"""
self.batch_size = batch_size
self.labels = labels
self.images = images
self.dim = dim
self.n_channels = n_channels
self.n_classes = n_classes
self.dtype = dtype
self.shuffle = shuffle
self.resize = resize
self.normalize_masks = normalize_masks
cache_max_size = int(cache_size_mb * 1048576 / (np.dtype(dtype).itemsize * prod(dim) * n_channels))
self.read_fun = cached(RRCache(maxsize=cache_max_size))(self._read_image)
self.labels_policy_if_none = labels_policy_if_none
if labels is None:
self._label_type = 'none'
if labels_policy_if_none not in ('none', 'int', 'image'):
raise ValueError('If labels is not specified, you have to choose the policy')
elif type(labels[0]) == int:
self._label_type = 'int'
elif type(labels[0]) == str:
self._label_type = 'image'