34 add proper logging #36

Merged on Dec 19, 2024 (7 commits; showing changes from 6 commits)
3 changes: 3 additions & 0 deletions README.md
@@ -16,6 +16,9 @@ cd segmentmytiff
python -m pip install .
```

## Logging
The application writes logs to the `log` directory, which is created if it doesn't exist yet. Messages printed to the screen (`stdout`) are also stored in `info.log` for later reference. More detailed information, such as input data shapes and value distributions, is written to `debug.log`.
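
For reference, this is roughly how the logger added in this PR is obtained and used from Python (illustrative snippet, not part of the diff):

```python
from segmentmytiff.logging_config import setup_logger

logger = setup_logger(__name__)  # creates the log directory and attaches the handlers
logger.info("shown on stdout and written to info.log")
logger.debug("written to debug.log only")
```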

## Train a feature extraction model

To train a feature extraction model run the script "train_model.py" in this repo:
63 changes: 63 additions & 0 deletions src/segmentmytiff/logging_config.py
@@ -0,0 +1,63 @@
import logging
import sys
import time
from contextlib import contextmanager
from pathlib import Path
from logging import handlers

import numpy as np
import pandas as pd


def setup_logger(name: str = None, level=logging.INFO):
Contributor: I see the default `level` here is only used for `stdout_sh`. Looking at the API, my intuitive reading would be that it indicates the default level of the returned `root_logger`. I suggest either renaming this to `stdout_level`, or passing this variable as the default level of `root_logger` (I prefer the second).

Contributor (author): My idea was to have logging to files for debug and info always on, and only have the level of stdout configurable. So in that sense, the second option isn't possible. I think the first option is an excellent suggestion.
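
A rough sketch of the agreed rename (hypothetical; the change is not in this diff yet):

```python
import logging
import sys

def setup_logger(name: str = None, stdout_level=logging.INFO):
    # Hypothetical rename: only the stdout verbosity is configurable,
    # while the debug.log / info.log file handlers stay fixed at DEBUG and INFO.
    stdout_sh = logging.StreamHandler(sys.stdout)
    stdout_sh.setLevel(stdout_level)
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.addHandler(stdout_sh)
    return root_logger
```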

"""
Setup a logger with a specific name and logging level.
"""

path = Path('log')
path.mkdir(exist_ok=True, parents=True)
Comment on lines +17 to +18
Contributor: Consider exposing the path of the log file as an option?

Contributor (author): I thought about that, but I think it adds unnecessary complexity for now. If you think we need it (now or soon), especially from the plugin side, I'll go for it of course.
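
If it is exposed later, one possible shape (purely hypothetical, not part of this PR) could be:

```python
import logging
from pathlib import Path

def setup_logger(name: str = None, level=logging.INFO, log_dir: Path = Path("log")):
    # log_dir is a hypothetical parameter; the PR currently hard-codes Path('log').
    log_dir.mkdir(exist_ok=True, parents=True)
    ...  # handler setup as in the function below
```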


formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# Create handlers
debug_fh = logging.handlers.RotatingFileHandler(path / 'debug.log', maxBytes=10 * 1024 * 1024, backupCount=5)
debug_fh.setLevel(logging.DEBUG)
debug_fh.setFormatter(formatter)
info_fh = logging.handlers.RotatingFileHandler(path / 'info.log')
info_fh.setLevel(logging.INFO)
info_fh.setFormatter(formatter)
stdout_sh = logging.StreamHandler(sys.stdout)
stdout_sh.setLevel(level)
stdout_sh.setFormatter(formatter)

# Configure logger and add handlers
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
Contributor: Following the suggestion on `level`, proposing this.

Suggested change:
-    root_logger.setLevel(logging.DEBUG)
+    root_logger.setLevel(level)

root_logger.addHandler(debug_fh)
root_logger.addHandler(info_fh)
root_logger.addHandler(stdout_sh)

return root_logger


@contextmanager
def log_duration(task_name: str, logger: logging.Logger):
"""
Log duration of a task.
"""
logger.info(f"{task_name} started")
start_time = time.perf_counter()
yield
duration = time.perf_counter() - start_time
logger.info(f"{task_name} finished in {duration:.4f} seconds")


def log_array(data: np.ndarray, logger, array_name:str="array") -> None:
Contributor: In this function, all the statistics are computed regardless of the logger's level. For large data this will significantly slow down your workflow. Maybe we should check the level of the logger?

IMO it may be better to move the calculations out of this function and pass the results in as a dictionary. When calling a logging function, the computation cost is usually assumed to be negligible; the user should be aware of what is actually being computed.

Contributor (author): Good point! I was focusing on gathering all the data we'd need to debug a user's problem; I didn't consider performance enough.

logger.debug(f"{array_name}")
indent = 4*" "
logger.debug(f"{indent}Shape: {data.shape}")
logger.debug(f"{indent}Min: {np.min(data)}")
logger.debug(f"{indent}Max: {np.max(data)}")
logger.debug(f"{indent}Average: {np.average(data)}")
logger.debug(f"{indent}Std: {np.std(data)}")
logger.debug(f"{indent}Value counts: \n{pd.DataFrame(data.flatten()).value_counts()}")
53 changes: 40 additions & 13 deletions src/segmentmytiff/main.py
@@ -7,11 +7,18 @@
from sklearn.ensemble import RandomForestClassifier

from segmentmytiff.features import get_features, FeatureType
from segmentmytiff.logging_config import setup_logger, log_duration, log_array
from segmentmytiff.utils.io import read_geotiff, save_tiff

logger = setup_logger(__name__)


def read_input_and_labels_and_save_predictions(input_path: Path, labels_path: Path, output_path: Path,
feature_type=FeatureType.IDENTITY, features_path:Path=None) -> None:
feature_type=FeatureType.IDENTITY, features_path: Path = None) -> None:
logger.info("read_input_and_labels_and_save_predictions called with the following arguments:")
for k, v in locals().items():
logger.info(f"{k}: {v}")

input_data, profile = read_geotiff(input_path)

features = get_features(input_data, input_path, feature_type, features_path, profile)
@@ -30,22 +37,42 @@ def make_predictions(input_data: ndarray, labels: ndarray) -> ndarray:
:param labels: labels with shape [1, width, height]
:return: probabilities with shape [class_values, width, height]
"""
print(labels.shape)
with log_duration("Prepare train data", logger):
train_data, train_labels = prepare_training_data(input_data, labels)

classifier = RandomForestClassifier(n_estimators=100, random_state=0)

with log_duration("Train model", logger):
classifier.fit(train_data, train_labels)

with log_duration("Make predictions", logger):
predictions = classifier.predict_proba(input_data.reshape((input_data.shape[0], -1)).transpose())
prediction_map = predictions.transpose().reshape((predictions.shape[1], *input_data.shape[1:]))
log_array(prediction_map, logger, array_name="Predictions")

return prediction_map


def prepare_training_data(input_data, labels):
class1_labels = labels[0] # Only single class is supported
flattened = class1_labels.flatten()
positive_instances = input_data.reshape((input_data.shape[0], -1))[:, flattened == 1].transpose()
negative_instances = input_data.reshape((input_data.shape[0], -1))[:, flattened == 0].transpose()
print('instances:', positive_instances.shape, negative_instances.shape)
train_data = np.concatenate((positive_instances, negative_instances))
train_labels = np.concatenate(((flattened[flattened == 1]), (flattened[flattened == 0])))
print('train data', train_labels.shape, train_data.shape)
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(train_data, train_labels)
predictions = classifier.predict_proba(input_data.reshape((input_data.shape[0], -1)).transpose())
print('predictions', predictions.shape, pd.DataFrame(predictions).value_counts())
prediction_map = predictions.transpose().reshape((predictions.shape[1], *input_data.shape[1:]))
print('prediction_map shape', prediction_map.shape)
return prediction_map
n_positive = positive_instances.shape[0]
n_negative = negative_instances.shape[0]
n_labeled = n_negative + n_positive
n_unlabeled = np.prod(labels.shape[-2:]) - n_labeled
logger.info(
f"{n_labeled} ({round(n_labeled / (n_labeled + n_unlabeled), 2)}%) labeled instances of a total of {n_labeled + n_unlabeled} instances.")
logger.info(
f"Training on {n_positive} ({round(100 * n_positive / n_labeled, 2)}%) positive labels and {n_negative} ({round(100 * n_negative / n_labeled, 2)}%) negative labels ")
order = np.arange(n_labeled)
np.random.shuffle(order)
train_data = np.concatenate((positive_instances, negative_instances))[order]
train_labels = np.concatenate(((flattened[flattened == 1]), (flattened[flattened == 0])))[order]
log_array(train_labels, logger, array_name="Train labels")
log_array(train_data, logger, array_name="Train data")
return train_data, train_labels


def parse_args():
3 changes: 2 additions & 1 deletion src/segmentmytiff/utils/datasets.py
@@ -1,3 +1,4 @@
import logging
from pathlib import Path

from PIL import Image
@@ -19,7 +20,7 @@ def image_path_to_mask_path(image_path: Path) -> Path:
self.masks = [str(image_path_to_mask_path(Path(p))) for p in self.images][:self.limit]
non_existing_masks = [p for p in self.masks if Path(p).exists() == False]
if non_existing_masks:
print(f"{len(non_existing_masks)} of a total of {len(self.masks)} masks not found.")
logging.getLogger().warning(f"{len(non_existing_masks)} of a total of {len(self.masks)} masks not found.")
Contributor: Depending on how we want this module to evolve, maybe consider initializing a module-level logger to collect all the logs, like you did in the other modules?

Contributor (author): Good catch! Thanks!
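
A minimal sketch of what that module-level logger could look like (hypothetical follow-up, not part of this diff):

```python
import logging

logger = logging.getLogger(__name__)  # module-level logger, mirroring the other modules

# Inside the dataset class, the warning then becomes:
#     logger.warning(f"{len(non_existing_masks)} of {len(self.masks)} masks not found.")
```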


if self.limit is None:
self.limit = len(self.images)
6 changes: 6 additions & 0 deletions src/segmentmytiff/utils/io.py
@@ -1,14 +1,20 @@
import logging
from pathlib import Path
from typing import Any, Union

import numpy as np
import rasterio

from segmentmytiff.logging_config import log_array

logger = logging.getLogger(__name__)


def read_geotiff(input_path: Path) -> (np.ndarray, Any):
with rasterio.open(input_path) as src:
data = src.read()
profile = src.profile
log_array(data, logger, array_name=input_path)
return data, profile


18 changes: 11 additions & 7 deletions src/segmentmytiff/utils/train_model.py
@@ -9,6 +9,8 @@
from torchinfo import summary
from tqdm import tqdm

from segmentmytiff.logging_config import setup_logger

try:
import mlflow

@@ -21,6 +23,8 @@
from segmentmytiff.utils.performance_metrics import dice_coefficient


logger = setup_logger(__name__)

def main(root_path, use_mlflow=True, train_set_limit=None, epochs=10, model_scale=1):
run_name = f"flair_toy_ep{epochs}_scale{str(model_scale).replace('.','_')}{'_tslim_'+str(train_set_limit) if train_set_limit is not None else ''}"

@@ -90,13 +94,13 @@ def train(model, train_dataloader, validation_dataloader, criterion, optimizer,
train_loss, train_dice = train_one_step(model, train_dataloader, optimizer, criterion, device)
val_loss, val_dice = validate(model, validation_dataloader, criterion, device)

print("-" * 30)
print(f"Training Loss EPOCH {epoch}: {train_loss:.4f}")
print(f"Training DICE EPOCH {epoch}: {train_dice:.4f}")
print("\n")
print(f"Validation Loss EPOCH {epoch}: {val_loss:.4f}")
print(f"Validation DICE EPOCH {epoch}: {val_dice:.4f}")
print("-" * 30)
logger.info("-" * 30)
logger.info(f"Training Loss EPOCH {epoch}: {train_loss:.4f}")
logger.info(f"Training DICE EPOCH {epoch}: {train_dice:.4f}")
logger.info("\n")
logger.info(f"Validation Loss EPOCH {epoch}: {val_loss:.4f}")
logger.info(f"Validation DICE EPOCH {epoch}: {val_dice:.4f}")
logger.info("-" * 30)

metrics = {
"train_loss": train_loss,