RLE compression

I initially experimented with LZ4 compression because of its speed. For the Gangnam Style video, for example, LZ4 reduced the file size by 49% and improved overall throughput from 3.0Mbps to 3.6Mbps. However, a simple RLE compression scheme reduced the file size by 54% and boosted throughput from 3.0Mbps to 5.1Mbps. The LZ4 library also needs as much as 8kB of memory (significant for a microcontroller) and wants to write all of its output to a buffer in memory, whereas the RLE decompression algorithm needs only a few bytes of extra memory and can output directly to SRAM over serial.
joeyparrish · Oct 3, 2024 · 024631e · 024631e
1 parent b3ff129
commit 024631e
Show file tree

Hide file tree

Showing 8 changed files with 293 additions and 6 deletions.
diff --git a/common/rle-common.h b/common/rle-common.h
@@ -0,0 +1,93 @@
+// Kinetoscope: A Sega Genesis Video Player
+//
+// Copyright (c) 2024 Joey Parrish
+//
+// See MIT License in LICENSE.txt
+
+// Shared RLE code.
+
+// Decoder state carried between calls to rle_to_sram().
+
+// A repeat-type control byte whose data byte has not arrived yet, or -1 if
+// no control byte is cached.
+static int _rle_control_byte = -1;
+// How many literal bytes are still to be copied to the output.
+static int _rle_more_literals = 0;
+
+// Forward declarations for the internal helpers defined below.
+static int _rle_output_literals(const uint8_t* data, int bytes);
+static void _rle_process(uint8_t control_byte, const uint8_t* data, int bytes);
+
+// Fallback MIN() for environments that don't already define one.  Note that
+// each argument is evaluated more than once.
+#if !defined(MIN)
+# define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+/**
+ * Decodes one chunk of an RLE stream and writes the decoded bytes to SRAM.
+ *
+ * The stream may be split across calls at any byte boundary.  Decoder state
+ * (a pending control byte, or a count of literal bytes still owed) is kept in
+ * file-scope variables between calls; call rle_reset() to clear it.
+ *
+ * Requires these macros:
+ *
+ * #define SRAM_WRITE(buffer, size)
+ *
+ * @param buffer The next chunk of compressed input.
+ * @param bytes The number of bytes available in |buffer|.
+ */
+static void rle_to_sram(const uint8_t* buffer, int bytes) {
+  if (bytes <= 0) {
+    // Nothing to decode.  Leave any pending state untouched for the next call.
+    return;
+  }
+
+  if (_rle_control_byte != -1) {
+    // Now we have the data to process this cached control byte from another
+    // callback.
+    int control_byte = _rle_control_byte;
+    _rle_control_byte = -1;  // clear cache
+    _rle_process(control_byte, buffer, bytes);
+    // _rle_process() loops until the whole buffer is consumed, so there is
+    // nothing left to decode here.  Falling through would decode the same
+    // buffer a second time, misreading its first data byte as a control byte.
+    return;
+  }
+
+  if (_rle_more_literals) {
+    // Finish the literal run that a previous call could not complete.
+    int consumed = _rle_output_literals(buffer, bytes);
+    buffer += consumed;
+    bytes -= consumed;
+  }
+
+  if (bytes) {
+    // Whatever remains starts with a fresh control byte.
+    _rle_process(buffer[0], buffer + 1, bytes - 1);
+  }
+}
+
+// Discards all decoder state: any unfinished literal run and any cached
+// control byte.
+static void rle_reset() {
+  _rle_more_literals = 0;
+  _rle_control_byte = -1;
+}
+
+// Copies pending literal bytes from |data| to the output, limited by both the
+// number of literals still owed (_rle_more_literals) and the |bytes| available.
+// Returns how many input bytes were consumed.
+static int _rle_output_literals(const uint8_t* data, int bytes) {
+  int count = _rle_more_literals;
+  if (!(count < bytes)) {
+    // The input runs out before (or exactly when) the literal run does.
+    count = bytes;
+  }
+
+  SRAM_WRITE(data, count);
+  _rle_more_literals -= count;
+  return count;
+}
+
+// Decodes commands starting with |control_byte|, whose payload begins at
+// |data| with |bytes| bytes available, and keeps decoding subsequent commands
+// until the input is exhausted.  If the input ends in the middle of a command,
+// the state needed to resume is left in _rle_control_byte or
+// _rle_more_literals.
+static void _rle_process(uint8_t control_byte, const uint8_t* data, int bytes) {
+  for (;;) {
+    // Low 7 bits: the size field.  High bit: the command type.
+    int size = control_byte & 0x7f;
+
+    if (control_byte & 0x80) {
+      // Repeat command: one byte follows, to be output |size| times.
+      if (bytes == 0) {
+        // The byte to repeat hasn't arrived yet.  Cache the control byte so
+        // the next call can pick up from here.
+        _rle_control_byte = control_byte;
+        return;
+      }
+
+      int remaining = size;
+      while (remaining > 0) {
+        SRAM_WRITE(data, 1);
+        --remaining;
+      }
+
+      // The repeated byte has been used up.
+      ++data;
+      --bytes;
+    } else {
+      // Literal command: |size| bytes follow, copied as-is.  Any that are not
+      // available yet stay counted in _rle_more_literals.
+      _rle_more_literals = size;
+      int used = _rle_output_literals(data, bytes);
+      data += used;
+      bytes -= used;
+    }
+
+    if (bytes == 0) {
+      // Input exhausted.
+      return;
+    }
+
+    // The next input byte is the control byte of the following command.
+    control_byte = *data;
+    ++data;
+    --bytes;
+  }
+}
diff --git a/emulator-patches/kinetoscope.c b/emulator-patches/kinetoscope.c
@@ -106,6 +106,15 @@ static void write_sram(uint32_t offset, const uint8_t* data, uint32_t size);
 // Defines sram_march_test()
 #include "../common/sram-common.h"
 
+// Macros for rle-common.h
+//
+// SRAM_WRITE(buffer, size) writes |size| bytes from |buffer| to SRAM at
+// global_sram_offset, then advances global_sram_offset by |size|.
+// The do/while(0) wrapper makes the expansion a single statement, so it is
+// safe in an unbraced if/else.  Note that |size| is evaluated twice.
+#define SRAM_WRITE(buffer, size) do { \
+  write_sram(global_sram_offset, (buffer), (size)); \
+  global_sram_offset += (size); \
+} while (0)
+
+// Defines rle_to_sram()
+#include "../common/rle-common.h"
+
 
 // Current time in milliseconds.
 static uint64_t ms_now() {
@@ -168,7 +177,7 @@ static void write_error_to_sram() {
 // Writes HTTP data to SRAM.
 static size_t http_data_to_sram(char* data, size_t size, size_t n, void* ctx) {
   if (global_compressed) {
-    return -1;  // FIXME
+    rle_to_sram(data, size * n);
   } else {
     write_sram(global_sram_offset, (const uint8_t*)data, size * n);
     global_sram_offset += size * n;

diff --git a/encoder/.gitignore b/encoder/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/encoder/encode_sega_video.py b/encoder/encode_sega_video.py
@@ -22,6 +22,8 @@
 import sys
 import tempfile
 
+from rle_encoder import rle_compress
+
 
 # A "magic" string in the file header to identify it.
 FILE_MAGIC = b"what nintendon't"
@@ -44,6 +46,7 @@
 
 # Compression constants.
 COMPRESSION_NONE = 0
+COMPRESSION_RLE = 1
 
 def main(args):
   if args.generate_resource_file and args.compressed:
@@ -675,6 +678,9 @@ def compress(compression, uncompressed):
   if compression == COMPRESSION_NONE:
     return uncompressed
 
+  if compression == COMPRESSION_RLE:
+    return rle_compress(uncompressed)
+
   raise RuntimeError('Unrecognized compression constant')
 
 
@@ -737,7 +743,7 @@ def generate_final_output(args, frame_dir, sound_dir, thumb_dir):
 
       f.write(bytes(128)) # relative URL, filled in for catalog later
 
-      compression = COMPRESSION_NONE
+      compression = COMPRESSION_RLE if args.compressed else COMPRESSION_NONE
       f.write(compression.to_bytes(2, 'big'))
 
       f.write(bytes(696)) # Padding/unused

diff --git a/encoder/rle_encoder.py b/encoder/rle_encoder.py
@@ -0,0 +1,100 @@
+# Kinetoscope: A Sega Genesis Video Player
+#
+# Copyright (c) 2024 Joey Parrish
+#
+# See MIT License in LICENSE.txt
+
+# The Kinetoscope RLE format is a byte-based sequence of commands.
+#
+# The first byte of a command is a control byte.
+#
+# The top bit (mask 0x80) is the type, and the bottom 7 bits (mask 0x7f)
+# are a size field.  The meaning of size depends on the type.
+#
+# type (mask 0x80):
+#   0x00: literal bytes follow, |size| of them, copy directly to output
+#   0x80: a single byte follows, repeat |size| times in the output
+
+
+import os
+
+
+# How many repeated bytes we need to make compression worth it.  Anything more
+# than 2 is technically "worth it" for those bytes themselves, but more
+# frequent small repeats means more frequent non-repeating literal sequences,
+# too, which increases the overhead for those.  So this is a balancing act.
+MIN_REPEAT_FOR_COMPRESSION = 8
+
+# This is the largest number that fits in the size field.  We can't repeat more
+# times than this per command, nor output more literal bytes in a row than
+# this.
+MAX_SIZE_FIELD = 127
+
+# Constants for the type bit.
+TYPE_LITERAL = 0x00
+TYPE_REPEAT = 0x80
+
+
+def _count_repeats(block, offset):
+  # Measure the run of identical bytes in |block| that begins at |offset|.
+  # Returns the run length, or 0 if |offset| is at or past the end.
+  start = offset
+  limit = len(block)
+  cursor = start
+  while cursor < limit:
+    if block[cursor] != block[start]:
+      break
+    cursor += 1
+  return cursor - start
+
+
+def rle_compress(block):
+  # Compress |block| (a bytes object) into the Kinetoscope RLE format described
+  # at the top of this file, and return the compressed data as bytes.
+  #
+  # Runs of at least MIN_REPEAT_FOR_COMPRESSION identical bytes become repeat
+  # commands; everything else is emitted as literal commands.
+
+  # The compressed output.  A bytearray is used so that appending does not
+  # copy the whole output each time.
+  output = bytearray()
+
+  # A buffer of literals to be flushed later.
+  literals = bytearray()
+
+  def flush_buffered_literals():
+    # Emit the buffered literals as one or more literal commands, each holding
+    # no more than fits in the size field, then empty the buffer.
+    for offset in range(0, len(literals), MAX_SIZE_FIELD):
+      literal_block = literals[offset:offset+MAX_SIZE_FIELD]
+      output.append(TYPE_LITERAL | len(literal_block))
+      output.extend(literal_block)
+
+    literals.clear()
+
+  def compress_repeats(data, count):
+    # Emit |data| (a single byte, as bytes) repeated |count| times, split into
+    # as many repeat commands as the size field requires.
+    while count:
+      # Don't output more at once than fits in this size field
+      repeat_count = min(count, MAX_SIZE_FIELD)
+      count -= repeat_count
+
+      output.append(TYPE_REPEAT | repeat_count)
+      output.extend(data)
+
+  i = 0
+  while i < len(block):
+    count = _count_repeats(block, i)
+    this_byte = block[i:i+1]  # Still bytes type, not int as block[i] would be
+    i += count
+
+    if count < MIN_REPEAT_FOR_COMPRESSION:
+      # Buffer literals for later.  The whole short run must be buffered, not
+      # just one byte, because |i| has already skipped past all |count| bytes.
+      literals.extend(this_byte * count)
+    else:
+      # Flush buffered literals first
+      flush_buffered_literals()
+      # Compress repeated sequence
+      compress_repeats(this_byte, count)
+
+  # Flush any remaining buffered literals
+  flush_buffered_literals()
+  return bytes(output)
diff --git a/firmware/firmware.ino b/firmware/firmware.ino
@@ -45,6 +45,10 @@
 
 #define NETWORK_TIMEOUT_SECONDS 30
 
+// Macro required by rle-common.h:
+#define SRAM_WRITE(buffer, size) sram_write(buffer, size)
+#include "rle-common.h"
+
 // Allocate a second 8kB stack for the second core.
 // https://github.com/earlephilhower/arduino-pico/blob/master/docs/multicore.rst
 bool core1_separate_stack = true;
@@ -72,7 +76,9 @@ static uint8_t* fetch_buffer = NULL;
 static int fetch_buffer_size = 0;
 static SegaVideoIndex video_index;
 
-static bool network_connected = false;
+// Also read by speed tests
+bool network_connected = false;
+
 static int chunk_size = 0;
 static int total_chunks = 0;
 static bool is_compressed = false;
@@ -138,6 +144,22 @@ bool http_sram_callback(const uint8_t* buffer, int bytes) {
   return true;
 }
 
+// Fetch callback for RLE-compressed data: passes each chunk to rle_to_sram().
+// If second_core_interrupt is set, the chunk is not decoded and false is
+// returned; otherwise returns true.
+// Also called by speed tests
+bool http_rle_sram_callback(const uint8_t* buffer, int bytes) {
+  // Sample the interrupt flag once, before touching the data.
+  const bool interrupted = second_core_interrupt;
+
+  if (!interrupted) {
+    rle_to_sram(buffer, bytes);
+  }
+  return !interrupted;
+}
+
+// Clears the RLE decoder's carried-over state by calling rle_reset().
+// Also called by speed tests
+void http_rle_reset() {
+  rle_reset();
+}
+
 static bool http_buffer_callback(const uint8_t* buffer, int bytes) {
   // Check for interrupt.
   if (second_core_interrupt) {
@@ -179,7 +201,7 @@ static bool fetch_into_buffer(void* buffer, const char* path,
 static bool fetch_into_sram(const char* path, int start_byte = 0,
                             int size = MAX_FETCH_SIZE,
                             bool decompress = false) {
-  fetch_callback = http_sram_callback;
+  fetch_callback = decompress ? http_rle_sram_callback : http_sram_callback;
   return fetch_generic(path, start_byte, size);
 }
 

diff --git a/firmware/rle-common.h b/firmware/rle-common.h
@@ -0,0 +1 @@
+../common/rle-common.h