Skip to content

Commit

Permalink
RLE compression
Browse files Browse the repository at this point in the history
I initially experimented with LZ4 compression due to its speed.  In
Gangnam Style, for example, LZ4 reduced the file size by 49%, but
didn't noticeably improve overall throughput.

The LZ4 library needs memory, as much as 8kB (significant for a
microcontroller), and wants to write all output to a buffer in memory.
I believe it is the overhead of copying memory around that kills the
benefit of reducing the network traffic.

By contrast, a simple RLE compression scheme reduced the file size by only
36%, yet boosted throughput from 3.0Mbps to 5.1Mbps.  The RLE
decompression algorithm can operate on just a few bytes of extra
memory and can output directly to SRAM over serial, so it is more
efficient overall.
  • Loading branch information
joeyparrish committed Oct 3, 2024
1 parent 8fd7257 commit 2786ce3
Show file tree
Hide file tree
Showing 8 changed files with 293 additions and 6 deletions.
93 changes: 93 additions & 0 deletions common/rle-common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Kinetoscope: A Sega Genesis Video Player
//
// Copyright (c) 2024 Joey Parrish
//
// See MIT License in LICENSE.txt

// Shared RLE code.

// Decoder state carried between calls to rle_to_sram().

// A repeat-type control byte that arrived as the last byte of a buffer, before
// the byte it repeats.  -1 means no control byte is cached.
static int _rle_control_byte = -1;
// The number of literal bytes from the current literal command that have not
// yet been copied to the output.
static int _rle_more_literals = 0;

// Internal helpers, defined below.
static int _rle_output_literals(const uint8_t* data, int bytes);
static void _rle_process(uint8_t control_byte, const uint8_t* data, int bytes);

// Only define MIN if the including file has not already provided one.
// Note that this evaluates the smaller argument twice.
#if !defined(MIN)
# define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

/**
 * Decompresses one chunk of an RLE stream and writes the output to SRAM.
 *
 * The compressed stream may be split across calls at any byte boundary.  State
 * left over from the previous call (a repeat control byte still waiting for
 * its data byte, or a literal run that is not yet complete) is resumed first.
 *
 * Requires these macros:
 *
 *   #define SRAM_WRITE(buffer, size)
 *
 * @param buffer The next chunk of compressed input.
 * @param bytes  The number of bytes available in |buffer|.
 */
static void rle_to_sram(const uint8_t* buffer, int bytes) {
  if (bytes <= 0) {
    // Nothing to decode.  Any carried-over state stays as it is.
    return;
  }

  if (_rle_control_byte != -1) {
    // Now we have the data to process this cached control byte from another
    // callback.
    int control_byte = _rle_control_byte;
    _rle_control_byte = -1;  // clear cache
    _rle_process(control_byte, buffer, bytes);

    // _rle_process() consumes the entire buffer (or caches what it could not
    // finish), so there is nothing left for this call to do.  Falling through
    // would decode the same bytes a second time, misreading buffer[0] as a
    // control byte.
    return;
  }

  if (_rle_more_literals) {
    // Finish the literal run that was cut short by the previous buffer.
    int consumed = _rle_output_literals(buffer, bytes);
    buffer += consumed;
    bytes -= consumed;
  }

  if (bytes > 0) {
    // The next byte is a fresh control byte, followed by its data.
    _rle_process(buffer[0], buffer + 1, bytes - 1);
  }
}

/**
 * Discards all decoder state carried between calls to rle_to_sram(): the
 * cached control byte and the count of outstanding literal bytes.
 *
 * Declared with an explicit (void) parameter list so that it is a proper
 * prototype in C as well as in C++.
 */
static void rle_reset(void) {
  _rle_control_byte = -1;
  _rle_more_literals = 0;
}

/**
 * Copies as many outstanding literal bytes as the input can supply.
 *
 * @param data  The input positioned at the next literal byte.
 * @param bytes The number of input bytes available at |data|.
 * @return The number of input bytes copied to the output, which is the
 *         smaller of |bytes| and the outstanding literal count.
 */
static int _rle_output_literals(const uint8_t* data, int bytes) {
  // Cap the copy at whichever runs out first: the literal run or the input.
  int count = bytes;
  if (_rle_more_literals < bytes) {
    count = _rle_more_literals;
  }

  SRAM_WRITE(data, count);
  _rle_more_literals -= count;
  return count;
}

/**
 * Decodes commands until the input is exhausted.
 *
 * @param control_byte The control byte of the first command.  The top bit
 *                     (0x80) selects a repeat command; the low 7 bits are the
 *                     size field.
 * @param data         The input immediately following |control_byte|.
 * @param bytes        The number of input bytes available at |data|.
 *
 * If the input ends in the middle of a command, the remainder is recorded in
 * _rle_control_byte (repeat without its data byte) or _rle_more_literals
 * (unfinished literal run).
 */
static void _rle_process(uint8_t control_byte, const uint8_t* data, int bytes) {
  const uint8_t* cursor = data;
  int remaining = bytes;
  uint8_t command = control_byte;

  for (;;) {
    const int count = command & 0x7f;

    if ((command & 0x80) == 0) {
      // Literal command: copy up to |count| bytes straight through.  Whatever
      // does not fit in this input stays counted in _rle_more_literals.
      _rle_more_literals = count;
      const int copied = _rle_output_literals(cursor, remaining);
      cursor += copied;
      remaining -= copied;
    } else if (remaining == 0) {
      // Repeat command, but the byte to repeat has not arrived yet.
      // Remember the command for the next call.
      _rle_control_byte = command;
      return;
    } else {
      // Repeat command: emit the next input byte |count| times.
      int left = count;
      while (left-- > 0) {
        SRAM_WRITE(cursor, 1);
      }

      // That one input byte is now consumed.
      cursor++;
      remaining--;
    }

    if (remaining == 0) {
      // Input exhausted.
      return;
    }

    // The next input byte is the control byte of the next command.
    command = *cursor;
    cursor++;
    remaining--;
  }
}
11 changes: 10 additions & 1 deletion emulator-patches/kinetoscope.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ static void write_sram(uint32_t offset, const uint8_t* data, uint32_t size);
// Defines sram_march_test()
#include "../common/sram-common.h"

// Macros for rle-common.h
// Writes |size| bytes from |buffer| at the current global SRAM offset, then
// advances that offset by |size|.
//
// Wrapped in do { } while (0) so that "SRAM_WRITE(...);" behaves as a single
// statement in any context (e.g. an unbraced if/else).  |size| is evaluated
// exactly once, and both arguments are parenthesized.
#define SRAM_WRITE(buffer, size) do { \
  const uint32_t sram_write_size_ = (size); \
  write_sram(global_sram_offset, (buffer), sram_write_size_); \
  global_sram_offset += sram_write_size_; \
} while (0)

// Defines rle_to_sram()
#include "../common/rle-common.h"


// Current time in milliseconds.
static uint64_t ms_now() {
Expand Down Expand Up @@ -168,7 +177,7 @@ static void write_error_to_sram() {
// Writes HTTP data to SRAM.
static size_t http_data_to_sram(char* data, size_t size, size_t n, void* ctx) {
if (global_compressed) {
return -1; // FIXME
rle_to_sram(data, size * n);
} else {
write_sram(global_sram_offset, (const uint8_t*)data, size * n);
global_sram_offset += size * n;
Expand Down
1 change: 1 addition & 0 deletions encoder/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__pycache__
8 changes: 7 additions & 1 deletion encoder/encode_sega_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import sys
import tempfile

from rle_encoder import rle_compress


# A "magic" string in the file header to identify it.
FILE_MAGIC = b"what nintendon't"
Expand All @@ -44,6 +46,7 @@

# Compression constants.
COMPRESSION_NONE = 0
COMPRESSION_RLE = 1

def main(args):
if args.generate_resource_file and args.compressed:
Expand Down Expand Up @@ -675,6 +678,9 @@ def compress(compression, uncompressed):
if compression == COMPRESSION_NONE:
return uncompressed

if compression == COMPRESSION_RLE:
return rle_compress(uncompressed)

raise RuntimeError('Unrecognized compression constant')


Expand Down Expand Up @@ -737,7 +743,7 @@ def generate_final_output(args, frame_dir, sound_dir, thumb_dir):

f.write(bytes(128)) # relative URL, filled in for catalog later

compression = COMPRESSION_NONE
compression = COMPRESSION_RLE if args.compressed else COMPRESSION_NONE
f.write(compression.to_bytes(2, 'big'))

f.write(bytes(696)) # Padding/unused
Expand Down
100 changes: 100 additions & 0 deletions encoder/rle_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Kinetoscope: A Sega Genesis Video Player
#
# Copyright (c) 2024 Joey Parrish
#
# See MIT License in LICENSE.txt

# The Kinetoscope RLE format is a byte-based sequence of commands.
#
# The first byte of a command is a control byte.
#
# The top bit (mask 0x80) is the type, and the bottom 7 bits (mask 0x7f)
# are a size field. The meaning of size depends on the type.
#
# type (mask 0x80):
# 0x00: literal bytes follow, |size| of them, copy directly to output
# 0x80: a single byte follows, repeat |size| times in the output


import os


# How many repeated bytes we need to make compression worth it. Anything more
# than 2 is technically "worth it" for those bytes themselves, but more
# frequent small repeats means more frequent non-repeating literal sequences,
# too, which increases the overhead for those. So this is a balancing act.
MIN_REPEAT_FOR_COMPRESSION = 8

# This is the largest number that fits in the size field. We can't repeat more
# times than this per command, nor output more literal bytes in a row than
# this.
MAX_SIZE_FIELD = 127

# Constants for the type bit.
TYPE_LITERAL = 0x00
TYPE_REPEAT = 0x80


def _count_repeats(block, offset):
original_offset = offset
while offset < len(block) and block[offset] == block[original_offset]:
offset += 1
# Offset is now the number of repeats.
return offset - original_offset


def rle_compress(block):
  """Compress |block| into the Kinetoscope RLE format.

  Runs of at least MIN_REPEAT_FOR_COMPRESSION identical bytes become repeat
  commands (control byte TYPE_REPEAT | count, followed by the repeated byte).
  Everything else is buffered and emitted as literal commands (control byte
  TYPE_LITERAL | count, followed by that many bytes).  No single command
  covers more than MAX_SIZE_FIELD bytes; longer runs are split.

  Args:
    block: A bytes-like object holding the uncompressed data.

  Returns:
    The compressed data as a bytes object.
  """
  # Mutable accumulators.  Appending to a bytearray is amortized O(1), whereas
  # "bytes += bytes" copies the whole accumulated result on every command.
  output = bytearray()

  # A buffer of literals to be flushed later.
  literals = bytearray()

  def flush_buffered_literals():
    # Don't output more at once than fits in the size field.
    for start in range(0, len(literals), MAX_SIZE_FIELD):
      literal_block = literals[start:start + MAX_SIZE_FIELD]
      output.append(TYPE_LITERAL | len(literal_block))
      output.extend(literal_block)

    literals.clear()

  def compress_repeats(value, count):
    # |value| is the repeated byte as an int.
    while count:
      # Don't output more at once than fits in the size field.
      repeat_count = min(count, MAX_SIZE_FIELD)
      count -= repeat_count

      output.append(TYPE_REPEAT | repeat_count)
      output.append(value)

  i = 0
  while i < len(block):
    count = _count_repeats(block, i)

    if count < MIN_REPEAT_FOR_COMPRESSION:
      # Too short to be worth a repeat command; buffer as literals for later.
      literals.extend(block[i:i + count])
    else:
      # Literals must be emitted before the repeat to preserve byte order.
      flush_buffered_literals()
      compress_repeats(block[i], count)

    i += count

  # Flush any remaining buffered literals.
  flush_buffered_literals()
  return bytes(output)
26 changes: 24 additions & 2 deletions firmware/firmware.ino
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@

#define NETWORK_TIMEOUT_SECONDS 30

// Macro required by rle-common.h:
// The functions in rle-common.h emit their output through SRAM_WRITE, so it
// must be defined before the header is included.  Here it forwards directly
// to sram_write().
#define SRAM_WRITE(buffer, size) sram_write(buffer, size)
#include "rle-common.h"

// Allocate a second 8kB stack for the second core.
// https://github.com/earlephilhower/arduino-pico/blob/master/docs/multicore.rst
bool core1_separate_stack = true;
Expand Down Expand Up @@ -72,7 +76,9 @@ static uint8_t* fetch_buffer = NULL;
static int fetch_buffer_size = 0;
static SegaVideoIndex video_index;

static bool network_connected = false;
// Also read by speed tests
bool network_connected = false;

static int chunk_size = 0;
static int total_chunks = 0;
static bool is_compressed = false;
Expand Down Expand Up @@ -138,6 +144,22 @@ bool http_sram_callback(const uint8_t* buffer, int bytes) {
return true;
}

// Fetch callback for RLE-compressed data: decompresses |bytes| bytes from
// |buffer| into SRAM via rle_to_sram().  Returns false, without touching the
// data, if the second core has been interrupted; returns true otherwise.
// Also called by speed tests
bool http_rle_sram_callback(const uint8_t* buffer, int bytes) {
  // Sample the interrupt flag once, up front.
  const bool interrupted = second_core_interrupt;

  if (!interrupted) {
    rle_to_sram(buffer, bytes);
  }

  return !interrupted;
}

// Clears the RLE decoder's carried-over state by calling rle_reset(), which
// drops any cached control byte and any outstanding literal count.
// NOTE(review): presumably meant to be called before starting a new
// compressed stream; confirm against callers.
// Also called by speed tests
void http_rle_reset() {
rle_reset();
}

static bool http_buffer_callback(const uint8_t* buffer, int bytes) {
// Check for interrupt.
if (second_core_interrupt) {
Expand Down Expand Up @@ -179,7 +201,7 @@ static bool fetch_into_buffer(void* buffer, const char* path,
// Fetches |size| bytes of |path|, starting at |start_byte|, through
// fetch_generic().  The data is routed to SRAM, via the RLE-decompressing
// callback when |decompress| is true and the plain SRAM callback otherwise.
// Returns the result of fetch_generic().
//
// NOTE(review): this does not reset the RLE decoder state itself; confirm
// that callers invoke rle_reset() before starting a new compressed stream.
static bool fetch_into_sram(const char* path, int start_byte = 0,
                            int size = MAX_FETCH_SIZE,
                            bool decompress = false) {
  // Select the callback in a single assignment; an unconditional assignment
  // before this one would be a dead store.
  fetch_callback = decompress ? http_rle_sram_callback : http_sram_callback;
  return fetch_generic(path, start_byte, size);
}

Expand Down
1 change: 1 addition & 0 deletions firmware/rle-common.h
Loading

0 comments on commit 2786ce3

Please sign in to comment.