Commit d2e4165b authored by Roberto Ugolotti's avatar Roberto Ugolotti
Browse files

Add notebook examples

parent 04ab862d
# Contents
This folder contains the notebooks used in the training course "Deep Learning on the JRC Data Platform". It contains two examples. The first
one uses Keras to create a Convolutional Neural Network from scratch to classify satellite images; the second one uses PyTorch to fine-tune
a pre-trained network for fake news detection.
\ No newline at end of file
%% Cell type:markdown id:8dbbbfff-6e38-415a-9cb3-08ae3a63d6de tags:
# Classify EuroSAT images using a simple network written in Keras
%% Cell type:markdown id:5618969b-62b3-463a-b89b-c2f792dc4d0a tags:
## SatImNet
SatImNet (https://www.mdpi.com/2072-4292/12/20/3358/htm) is a collection of open training data, structured and harmonized according to specific rules provided by BDAP. In this example we are going to use this library (still in development) to retrieve EuroSAT (https://github.com/phelber/eurosat) data in a format readily accessible through Keras and create a CNN-based classifier.
## Keras reader
Keras provides several utilities for feeding data into its networks. One example is https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image_dataset_from_directory, which reads images from disk when they are arranged in a simple directory structure. The DataGenerator class in the SatImNet library reads a batch of images from a list and loads only those into memory, limiting resource usage. A simple cache mechanism further reduces disk accesses.
%% Cell type:code id:cd06012e-2276-48a2-a0e0-b08f0b482cf8 tags:
``` python
from satimnet.dataset_utils import get_images_and_labels
from satimnet.keras_readers import DataGenerator
from sklearn.model_selection import train_test_split
import numpy as np
```
%% Cell type:code id:423ef7a1-55ef-4c6a-b26f-1d53ad8e3070 tags:
``` python
# Ask SatImNet for the EuroSAT RGB image names and the matching labels
images, labels = get_images_and_labels('eurosat', 'rgb')
# Keep one sample out of five: the full dataset would take too long for this demo
images, labels = images[::5], labels[::5]
```
%% Cell type:code id:bdee8ce5-43fb-4497-9011-43a75bc2de4a tags:
``` python
# Show the first few (image, label) pairs to check what SatImNet returned
preview_count = 5
for image, label in zip(images[:preview_count], labels[:preview_count]):
    print(image, '|', label)
```
%% Cell type:code id:bcb6ec6f-8e2c-4fc6-988b-8a129093c78e tags:
``` python
# Split into train and validation sets (a separate held-out test set would be better).
# A fixed random_state makes the split reproducible, so every run of the notebook
# trains and validates on the same images.
images_train, images_val, labels_train, labels_val = train_test_split(
    images, labels, test_size=0.33, random_state=42)
```
%% Cell type:code id:3d4c291a-3f11-4170-9428-2045105658fc tags:
``` python
# Class names, in the order in which they are looked up by class index below
label_names = [
    "AnnualCrop", "Forest", "HerbaceousVegetation", "Highway", "Industrial",
    "Pasture", "PermanentCrop", "Residential", "River", "SeaLake",
]
num_classes = len(label_names)
image_size = (64, 64)
n_channels = 3  # RGB

# Settings shared by the training and the validation generators
generator_options = dict(dim=image_size, n_channels=n_channels,
                         batch_size=32, n_classes=num_classes)

# Training data is read in random order (better for training)
dg_train = DataGenerator(images_train, labels_train, shuffle=True, **generator_options)
# Validation data is read in sequential order (the order does not matter for validation)
dg_val = DataGenerator(images_val, labels_val, shuffle=False, **generator_options)
```
%% Cell type:code id:d4332797-4ad3-4627-8382-025de3f9eca5 tags:
``` python
# Fetch the first batch produced by the training generator
gen_images, gen_labels = dg_train[0]
batch_length = len(gen_labels)
print('A batch contains %d images' % batch_length)
```
%% Cell type:code id:401c76ba-e4da-4c21-b324-d0684de4521f tags:
``` python
# Use pyplot directly: the pylab interface is discouraged by Matplotlib
import matplotlib.pyplot as plt

# Show up to nine images of the batch in a 3x3 grid, each titled with its class name
_, axes = plt.subplots(3, 3, figsize=(10, 7))
axes = axes.ravel()
# zip stops at the shortest input, so a batch with fewer than nine images
# no longer raises an IndexError (the remaining panels simply stay empty)
for ax, image, label_vector in zip(axes, gen_images, gen_labels):
    # Pixel values are divided by 255 before display, as in the model's Rescaling layer
    ax.imshow(image / 255)
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    # The position of the largest entry of the label vector is the class index
    ax.set_title(label_names[label_vector.argmax()])
```
%% Cell type:code id:ca4ace2e-4e56-40d6-82fe-14dd89809c0b tags:
``` python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
```
%% Cell type:code id:979c0a8c-28d3-4fb9-8036-372724bec604 tags:
``` python
# Check that TensorFlow will use a GPU.
# tf.config.list_physical_devices is the public API for querying the hardware
# (device_lib lives in the private tensorflow.python namespace); the devices are
# enumerated once per query instead of stringifying the whole device list twice.
print(tf.config.list_physical_devices())
gpus = tf.config.list_physical_devices('GPU')
assert gpus, 'TensorFlow cannot see any GPU'
```
%% Cell type:code id:ce4de5fa-205c-436e-8ee8-74f6713dded9 tags:
``` python
def make_model(input_shape, num_classes):
    """Build a small convolutional classifier.

    The network rescales its input by 1/255, applies three
    convolution / batch-normalization / ReLU / max-pooling stages
    (32, 64 and 128 filters), one last 1024-filter convolution stage
    followed by global average pooling and dropout, and ends with a
    softmax layer that has one neuron per class.

    Args:
        input_shape: Shape of a single input sample, passed to ``keras.Input``.
        num_classes: Number of neurons of the final softmax layer.

    Returns:
        The (uncompiled) ``keras.Model``.
    """
    # Input layer: its size is the size of the input (the shape is preserved)
    inputs = keras.Input(shape=input_shape)
    # Entry block
    x = layers.experimental.preprocessing.Rescaling(1.0 / 255)(inputs)

    # Three stages with the same structure; only the first convolution is strided
    for filters, strides in ((32, 2), (64, 1), (128, 1)):
        x = layers.Conv2D(filters, 3, strides=strides, padding="same")(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation("relu")(x)
        x = layers.MaxPooling2D((3, 3), strides=2, padding="same")(x)

    # Last convolutional stage, reduced to one value per filter
    x = layers.Conv2D(1024, 3, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.5)(x)

    # Final layer: the classification output has one neuron for each class
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    return keras.Model(inputs, outputs)


model = make_model(input_shape=(*image_size, n_channels), num_classes=num_classes)
```
%% Cell type:code id:31e8f683-2b61-4003-9974-0b5d3f0a9724 tags:
``` python
# Print a summary of the model built above
model.summary()
```
%% Cell type:code id:8efe85d0-f46f-43fd-826f-373fdc64fbe8 tags:
``` python
# Optimizer: how to update the weights
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    # Loss: what the optimizer will optimize
    loss="categorical_crossentropy",
    # Metric: what is used to evaluate the model
    metrics=["categorical_accuracy"],
)
```
%% Cell type:code id:a490b520-90e4-40d2-a1e0-ed0c999cfd99 tags:
``` python
# Number of epochs: at each one, the entire training set is presented to the network
epochs = 10

# Checked at every epoch: the model is written to disk only when the
# monitored validation accuracy is the best seen so far
save_model_cb = keras.callbacks.ModelCheckpoint(
    'best_model.h5',
    monitor='val_categorical_accuracy',
    save_best_only=True,
    save_freq='epoch',
    verbose=1,
)

model.fit(
    dg_train,
    validation_data=dg_val,
    epochs=epochs,
    callbacks=[save_model_cb],
    verbose=1,
)
```
%% Cell type:code id:dfc8f949-3516-479b-a28c-57951354bc02 tags:
``` python
# How to load the best model: read back the 'best_model.h5' file from disk
best_model = keras.models.load_model('best_model.h5')
```
%% Cell type:code id:57f058a4-0df4-408e-9f11-71313ab4b86f tags:
``` python
# Let's test the network on the validation data
predictions = best_model.predict(dg_val)
# Print small numbers in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

first_response = predictions[0]
first_class = first_response.argmax()
print('This is how a single response of the network looks like:\n', first_response)
print('So, the class predicted by the network is %d -> %s' % (first_class, label_names[first_class]))
print('All predictions for all inputs are:', predictions.argmax(axis=1))
```
%% Cell type:code id:4c23f0ae-07b4-4a71-b4d5-8ce8716d1d46 tags:
``` python
# Each item of the validation generator is an (images, labels) pair: keep the labels
val_label_batches = [batch[1] for batch in dg_val]
# Turn label vectors and network responses into class indices
true_labels = np.argmax(np.concatenate(val_label_batches), axis=1)
pred_labels = np.argmax(predictions, axis=1)
```
%% Cell type:code id:fcaf92f4-6e48-4df3-9dc1-e1813a5b19e3 tags:
``` python
from sklearn.metrics import confusion_matrix
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted ones. Passing `labels` keeps the matrix
# num_classes x num_classes even if a class never appears in the validation data.
cm = confusion_matrix(true_labels, pred_labels, labels=np.arange(num_classes))
```
%% Cell type:code id:e6037851-3b01-48b2-b4db-827545f730a2 tags:
``` python
import seaborn as sns
# Draw the confusion matrix as a heat map annotated with the integer counts,
# using the class names as tick labels on both axes
sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_names, yticklabels=label_names)
```
%% Cell type:markdown id:c00cd2f9-8400-4cef-aedc-a4a59146bf47 tags:
## What we have learned
* How to read SatImNet data
* How to split a dataset into training and validation sets using Scikit-learn
* How to create a convolutional neural network with Keras
* How to train a network
* How to evaluate the results
""" Package to read Machine Library dataset in EOS. """
\ No newline at end of file
import json
import os
from glob import glob
from zipfile import ZipFile
from tqdm import tqdm
from .readers import MAINFOLDER
# Supported datasets: lower-case dataset name -> settings.
# Every dataset has a 'path' (its folder inside MAINFOLDER).
DATASETS = {
    name: {'path': os.path.join(MAINFOLDER, folder)}
    for name, folder in (
        ('airbus-ship', 'Airbus-ship'),
        ('bigearthnet', 'BigEarthNet-v1.0'),
        ('clouds-s2-taiwan', 'clouds-s2-taiwan'),
        ('dota', 'DOTA'),
        ('eurosat', 'EuroSAT'),
        ('inria_aerial_image_labeling', 'Inria_Aerial_Image_Labeling'),
        ('xview', 'xView'),
    )
}

# EuroSAT additionally declares its classes: class name -> class index
DATASETS['eurosat']['classes'] = {
    class_name: class_index
    for class_index, class_name in enumerate((
        "AnnualCrop",
        "Forest",
        "HerbaceousVegetation",
        "Highway",
        "Industrial",
        "Pasture",
        "PermanentCrop",
        "Residential",
        "River",
        "SeaLake",
    ))
}

# Prefix prepended to archive paths to read data using GDAL
ZIP_PREFIX = '/vsizip/'
def get_images_and_labels(dataset, *args, **kwargs):
    """Return the images and the labels of a dataset.

    Args:
        dataset: Name of the dataset (case-insensitive); must be one of the
            keys of ``DATASETS``.
        *args: Positional arguments forwarded to the dataset-specific reader.
        **kwargs: Keyword arguments forwarded to the dataset-specific reader.

    Returns:
        Two lists, one for images and one for labels (or None instead of
        the labels if they are not available).

    Raises:
        RuntimeError: If ``dataset`` is not a known dataset.
        NotImplementedError: If the dataset is known but cannot be read yet
            ('bigearthnet' and 'xview').
    """
    dataset = dataset.lower()
    if dataset not in DATASETS:
        raise RuntimeError('Invalid dataset', dataset)

    # Known datasets that are not readable yet. The calls to their readers
    # used to sit, unreachable, right after these raise statements.
    if dataset == 'bigearthnet':
        raise NotImplementedError('Big Earth Net dataset still not accessible')
    if dataset == 'xview':
        raise NotImplementedError('xView dataset still not accessible')

    if dataset == 'airbus-ship':
        return _get_ial_airbus(*args, **kwargs)
    if dataset == 'clouds-s2-taiwan':
        return _get_ial_clouds(*args, **kwargs)
    if dataset == 'dota':
        return _get_ial_dota(*args, **kwargs)
    if dataset == 'eurosat':
        return _get_ial_eurosat(*args, **kwargs)
    if dataset == 'inria_aerial_image_labeling':
        return _get_ial_inria_aerial_image_labeling(*args, **kwargs)

    # Only reachable if a dataset is added to DATASETS without a reader:
    # fail loudly instead of silently returning None.
    raise NotImplementedError('No reader available for dataset %s' % dataset)
def __airbus_image_path(main_path, set_name, fname):
    """Build the ZIP_PREFIX-based path of the '.jpg' image called ``fname``.

    The image is looked up under ``<main_path>/<set_name>/images``, inside the
    zip archive named after the first character of ``fname``.
    """
    archive_name = '%s.zip' % fname[0]
    image_file = fname + '.jpg'
    return os.path.join(ZIP_PREFIX + main_path, set_name, 'images', archive_name, image_file)
def __airbus_label_path(main_path, fname):
    """Build the ZIP_PREFIX-based path of the '.png' label called ``fname``.

    Labels are always read from the 'labels_png.zip' archive of the 'train' set.
    """
    zip_root = ZIP_PREFIX + main_path
    label_file = fname + '.png'
    return os.path.join(zip_root, 'train', 'labels_png.zip', label_file)
def _get_ial_airbus(set_name, only_training_images_with_labels=False):