Merge pull request #36 from DroneML/34-add-propper-logging
34 add propper logging
cwmeijer authored Dec 19, 2024
2 parents ff1ff62 + f431418 commit 8689614
Showing 6 changed files with 118 additions and 21 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -16,6 +16,9 @@ cd segmentmytiff
python -m pip install .
```

## Logging
The application writes logs to the `log` directory, which is created if it doesn't exist yet. Messages printed to the screen (`stdout`) are also stored in `info.log` for later reference. More detailed information, such as input data shapes and value distributions, is written to `debug.log`.
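
For reference, the new logging can be exercised directly from Python; a minimal sketch (the message strings are illustrative):

```python
from segmentmytiff.logging_config import setup_logger

logger = setup_logger(__name__)
logger.info("shown on stdout and written to log/info.log")  # INFO and up
logger.debug("only written to log/debug.log")  # DEBUG-level detail
```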

## Train a feature extraction model

To train a feature extraction model run the script "train_model.py" in this repo:
56 changes: 56 additions & 0 deletions src/segmentmytiff/logging_config.py
@@ -0,0 +1,56 @@
import logging
import sys
import time
from contextlib import contextmanager
from pathlib import Path
from logging import handlers

import numpy as np


def setup_logger(name: str = None, stdout_level=logging.INFO):
    """
    Set up root logging (rotating debug/info files plus stdout) and return
    the logger for `name`, or the root logger if no name is given.
    """

    path = Path('log')
    path.mkdir(exist_ok=True, parents=True)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Create handlers
    debug_fh = logging.handlers.RotatingFileHandler(path / 'debug.log', maxBytes=10 * 1024 * 1024, backupCount=5)
    debug_fh.setLevel(logging.DEBUG)
    debug_fh.setFormatter(formatter)
    # Without maxBytes a RotatingFileHandler never rotates; cap it like debug.log.
    info_fh = logging.handlers.RotatingFileHandler(path / 'info.log', maxBytes=10 * 1024 * 1024, backupCount=5)
    info_fh.setLevel(logging.INFO)
    info_fh.setFormatter(formatter)
    stdout_sh = logging.StreamHandler(sys.stdout)
    stdout_sh.setLevel(stdout_level)
    stdout_sh.setFormatter(formatter)

    # Configure the root logger; guard against adding the handlers twice
    # when setup_logger is called from more than one module.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    if not root_logger.handlers:
        root_logger.addHandler(debug_fh)
        root_logger.addHandler(info_fh)
        root_logger.addHandler(stdout_sh)

    return logging.getLogger(name) if name else root_logger


@contextmanager
def log_duration(task_name: str, logger: logging.Logger):
    """
    Log duration of a task.
    """
    logger.info(f"{task_name} started")
    start_time = time.perf_counter()
    yield
    duration = time.perf_counter() - start_time
    logger.info(f"{task_name} finished in {duration:.4f} seconds")


def log_array(data: np.ndarray, logger: logging.Logger, array_name: str = "array") -> None:
    logger.debug(f"{array_name} shape: {data.shape}")
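
As a usage sketch (array sizes illustrative), the three helpers combine like this:

```python
import numpy as np

from segmentmytiff.logging_config import setup_logger, log_duration, log_array

logger = setup_logger(__name__)

data = np.zeros((3, 512, 512))  # e.g. a 3-band raster
log_array(data, logger, array_name="input")  # shape is recorded in log/debug.log

with log_duration("Expensive step", logger):  # logs "started" and the elapsed seconds
    data = data * 2
```

The rotating debug handler caps the file at 10 MiB and keeps five backups (debug.log.1 … debug.log.5), so per-array shape logging cannot grow without bound.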
53 changes: 40 additions & 13 deletions src/segmentmytiff/main.py
@@ -7,11 +7,18 @@
 from sklearn.ensemble import RandomForestClassifier
 
 from segmentmytiff.features import get_features, FeatureType
+from segmentmytiff.logging_config import setup_logger, log_duration, log_array
 from segmentmytiff.utils.io import read_geotiff, save_tiff
 
+logger = setup_logger(__name__)
+
 
 def read_input_and_labels_and_save_predictions(input_path: Path, labels_path: Path, output_path: Path,
-                                               feature_type=FeatureType.IDENTITY, features_path:Path=None) -> None:
+                                               feature_type=FeatureType.IDENTITY, features_path: Path = None) -> None:
+    logger.info("read_input_and_labels_and_save_predictions called with the following arguments:")
+    for k, v in locals().items():
+        logger.info(f"{k}: {v}")
+
     input_data, profile = read_geotiff(input_path)
 
     features = get_features(input_data, input_path, feature_type, features_path, profile)
@@ -30,22 +37,42 @@ def make_predictions(input_data: ndarray, labels: ndarray) -> ndarray:
     :param labels: labels with shape [1, width, height]
     :return: probabilities with shape [class_values, width, height]
     """
-    print(labels.shape)
+    with log_duration("Prepare train data", logger):
+        train_data, train_labels = prepare_training_data(input_data, labels)
+
+    classifier = RandomForestClassifier(n_estimators=100, random_state=0)
+
+    with log_duration("Train model", logger):
+        classifier.fit(train_data, train_labels)
+
+    with log_duration("Make predictions", logger):
+        predictions = classifier.predict_proba(input_data.reshape((input_data.shape[0], -1)).transpose())
+        prediction_map = predictions.transpose().reshape((predictions.shape[1], *input_data.shape[1:]))
+    log_array(prediction_map, logger, array_name="Predictions")
+
+    return prediction_map
+
+
+def prepare_training_data(input_data, labels):
     class1_labels = labels[0]  # Only single class is supported
     flattened = class1_labels.flatten()
     positive_instances = input_data.reshape((input_data.shape[0], -1))[:, flattened == 1].transpose()
     negative_instances = input_data.reshape((input_data.shape[0], -1))[:, flattened == 0].transpose()
-    print('instances:', positive_instances.shape, negative_instances.shape)
-    train_data = np.concatenate((positive_instances, negative_instances))
-    train_labels = np.concatenate(((flattened[flattened == 1]), (flattened[flattened == 0])))
-    print('train data', train_labels.shape, train_data.shape)
-    classifier = RandomForestClassifier(n_estimators=100, random_state=0)
-    classifier.fit(train_data, train_labels)
-    predictions = classifier.predict_proba(input_data.reshape((input_data.shape[0], -1)).transpose())
-    print('predictions', predictions.shape, pd.DataFrame(predictions).value_counts())
-    prediction_map = predictions.transpose().reshape((predictions.shape[1], *input_data.shape[1:]))
-    print('prediction_map shape', prediction_map.shape)
-    return prediction_map
+    n_positive = positive_instances.shape[0]
+    n_negative = negative_instances.shape[0]
+    n_labeled = n_negative + n_positive
+    n_unlabeled = np.prod(labels.shape[-2:]) - n_labeled
+    logger.info(
+        f"{n_labeled} ({round(100 * n_labeled / (n_labeled + n_unlabeled), 2)}%) labeled instances of a total of {n_labeled + n_unlabeled} instances.")
+    logger.info(
+        f"Training on {n_positive} ({round(100 * n_positive / n_labeled, 2)}%) positive labels and {n_negative} ({round(100 * n_negative / n_labeled, 2)}%) negative labels")
+    order = np.arange(n_labeled)
+    np.random.shuffle(order)
+    train_data = np.concatenate((positive_instances, negative_instances))[order]
+    train_labels = np.concatenate(((flattened[flattened == 1]), (flattened[flattened == 0])))[order]
+    log_array(train_labels, logger, array_name="Train labels")
+    log_array(train_data, logger, array_name="Train data")
+    return train_data, train_labels
 
 
 def parse_args():
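
The reshaping in make_predictions is the core trick: each pixel becomes one row of a feature table for scikit-learn, and the predicted probabilities are folded back onto the image grid. A self-contained sketch (band count and image size invented for illustration):

```python
import numpy as np

bands, width, height = 3, 4, 5
input_data = np.arange(bands * width * height).reshape((bands, width, height))

# [bands, width, height] -> [pixels, bands]: one row per pixel,
# the layout RandomForestClassifier.fit/predict_proba expects.
pixels = input_data.reshape((input_data.shape[0], -1)).transpose()
assert pixels.shape == (width * height, bands)

# The inverse trip: per-pixel class probabilities back onto the grid.
n_classes = 2
probabilities = np.zeros((width * height, n_classes))
prediction_map = probabilities.transpose().reshape((n_classes, *input_data.shape[1:]))
assert prediction_map.shape == (n_classes, width, height)
```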
3 changes: 2 additions & 1 deletion src/segmentmytiff/utils/datasets.py
@@ -1,3 +1,4 @@
+import logging
 from pathlib import Path
 
 from PIL import Image
@@ -19,7 +20,7 @@ def image_path_to_mask_path(image_path: Path) -> Path:
         self.masks = [str(image_path_to_mask_path(Path(p))) for p in self.images][:self.limit]
         non_existing_masks = [p for p in self.masks if Path(p).exists() == False]
         if non_existing_masks:
-            print(f"{len(non_existing_masks)} of a total of {len(self.masks)} masks not found.")
+            logging.getLogger().warning(f"{len(non_existing_masks)} of a total of {len(self.masks)} masks not found.")
 
         if self.limit is None:
             self.limit = len(self.images)
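
One design note on the line above: logging.getLogger() with no argument returns the root logger, so the record's %(name)s field will read "root". An alternative, not what this commit does, is a module-level logger that carries the module name:

```python
import logging

logger = logging.getLogger(__name__)  # %(name)s shows e.g. segmentmytiff.utils.datasets

logger.warning("2 of a total of 10 masks not found.")  # illustrative message
```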
6 changes: 6 additions & 0 deletions src/segmentmytiff/utils/io.py
@@ -1,14 +1,20 @@
import logging
from pathlib import Path
from typing import Any, Union

import numpy as np
import rasterio

from segmentmytiff.logging_config import log_array

logger = logging.getLogger(__name__)


def read_geotiff(input_path: Path) -> (np.ndarray, Any):
    with rasterio.open(input_path) as src:
        data = src.read()
        profile = src.profile
    log_array(data, logger, array_name=str(input_path))
    return data, profile


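
As a usage sketch (the file name is hypothetical), reading a raster now records its shape in debug.log as a side effect:

```python
from pathlib import Path

from segmentmytiff.logging_config import setup_logger
from segmentmytiff.utils.io import read_geotiff

setup_logger()  # attach the file/stdout handlers first
data, profile = read_geotiff(Path("orthomosaic.tif"))  # hypothetical input file
print(data.shape, profile["count"])  # raster array and band count from the profile
```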
18 changes: 11 additions & 7 deletions src/segmentmytiff/utils/train_model.py
@@ -9,6 +9,8 @@
 from torchinfo import summary
 from tqdm import tqdm
 
+from segmentmytiff.logging_config import setup_logger
+
 try:
     import mlflow
 
@@ -21,6 +23,8 @@
 from segmentmytiff.utils.performance_metrics import dice_coefficient
 
 
+logger = setup_logger(__name__)
+
 def main(root_path, use_mlflow=True, train_set_limit=None, epochs=10, model_scale=1):
     run_name = f"flair_toy_ep{epochs}_scale{str(model_scale).replace('.','_')}{'_tslim_'+str(train_set_limit) if train_set_limit is not None else ''}"
 
@@ -90,13 +94,13 @@ def train(model, train_dataloader, validation_dataloader, criterion, optimizer,
         train_loss, train_dice = train_one_step(model, train_dataloader, optimizer, criterion, device)
         val_loss, val_dice = validate(model, validation_dataloader, criterion, device)
 
-        print("-" * 30)
-        print(f"Training Loss EPOCH {epoch}: {train_loss:.4f}")
-        print(f"Training DICE EPOCH {epoch}: {train_dice:.4f}")
-        print("\n")
-        print(f"Validation Loss EPOCH {epoch}: {val_loss:.4f}")
-        print(f"Validation DICE EPOCH {epoch}: {val_dice:.4f}")
-        print("-" * 30)
+        logger.info("-" * 30)
+        logger.info(f"Training Loss EPOCH {epoch}: {train_loss:.4f}")
+        logger.info(f"Training DICE EPOCH {epoch}: {train_dice:.4f}")
+        logger.info("\n")
+        logger.info(f"Validation Loss EPOCH {epoch}: {val_loss:.4f}")
+        logger.info(f"Validation DICE EPOCH {epoch}: {val_dice:.4f}")
+        logger.info("-" * 30)
 
         metrics = {
             "train_loss": train_loss,
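
With the formatter defined in logging_config.py, the per-epoch messages above land in info.log roughly as follows (timestamp, logger name, and metric values are illustrative):

```
2024-12-19 10:15:02,113 - segmentmytiff.utils.train_model - INFO - Training Loss EPOCH 1: 0.1234
2024-12-19 10:15:02,114 - segmentmytiff.utils.train_model - INFO - Validation DICE EPOCH 1: 0.5678
```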
