diff --git a/target/linux/realtek/image/rt-loader/Makefile b/target/linux/realtek/image/rt-loader/Makefile
new file mode 100644
index 00000000000..6479db705c8
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/Makefile
@@ -0,0 +1,98 @@
+# rt-loader make file
+# (c) 2025 Markus Stockhausen
+#
+# This is the makefile for the rt-loader (aka runtime or realtek loader). It tries to
+# avoid copying files around where possible. Therefore it is controlled by the following
+# input parameters:
+#
+# KERNEL_IMG_IN:	The filename of an LZMA compressed kernel image. This is required.
+# KERNEL_IMG_OUT:	The filename of the kernel image with the rt-loader prepended.
+#			If not given it will be created as image.bin into the BUILD_DIR.
+# BUILD_DIR: 		The temporary build dir. If not given it will be set to "build".
+#
+# To add it into the OpenWrt toolchain just create two new build commands
+#
+# define Build/rt-loader
+#   $(MAKE) all clean -C rt-loader CROSS_COMPILE="$(TARGET_CROSS)" \
+#	    KERNEL_IMG_IN="$@" KERNEL_IMG_OUT="$@.new" BUILD_DIR="$@.build"
+#   mv "$@.new" "$@"
+# endef
+#
+# define Build/rt-compress
+#   $(STAGING_DIR_HOST)/bin/xz --format=lzma -9 --stdout "$@" > "$@.new"
+#   mv "$@.new" "$@"
+# endef
+#
+# Use them in a new kernel build recipe
+#
+# define Device/uimage-rt-loader
+#   KERNEL/rt-loader := kernel-bin | append-dtb | rt-compress | rt-loader
+#   KERNEL := $$(KERNEL/rt-loader) | uImage none
+#   KERNEL_INITRAMFS := $$(KERNEL/rt-loader) | uImage none
+# endef
+#
+# And finally add it to the target device. E.g.
+#
+# define Device/linksys_lgs310c
+#   $(Device/uimage-rt-loader)
+#   ...
+# endef
+
+CC		:= $(CROSS_COMPILE)gcc
+LD		:= $(CROSS_COMPILE)ld
+OBJCOPY		:= $(CROSS_COMPILE)objcopy
+OBJDUMP		:= $(CROSS_COMPILE)objdump
+
+CFLAGS		= -fpic -mabicalls -O2 -fno-builtin-printf -Iinclude
+
+ASFLAGS		= -fpic -msoft-float -Iinclude
+
+LDFLAGS		= -static -nostdlib -T linker/linker.ld --no-warn-mismatch
+
+O_FORMAT 	= $(shell $(OBJDUMP) -i | head -2 | grep elf32)
+
+SOURCES		= src/startup.S src/main.c src/board.c src/memory.c src/unlzma.c
+
+BUILD_DIR	?= build
+
+IMAGE_OBJ	:= $(BUILD_DIR)/image.o
+IMAGE_ELF     	:= $(BUILD_DIR)/image.elf
+
+KERNEL_IMG_OUT	?= $(BUILD_DIR)/image.bin
+
+OBJECTS_C	= $(filter %.c,$(SOURCES))
+OBJECTS_S	= $(filter %.S,$(SOURCES))
+
+OBJECTS		:= $(OBJECTS_S:.S=.o) $(OBJECTS_C:.c=.o)
+OBJECTS		:= $(patsubst %.o, $(BUILD_DIR)/%.o, $(OBJECTS)) $(IMAGE_OBJ)
+
+ifneq ($(MAKECMDGOALS),clean)
+ifndef KERNEL_IMG_IN
+$(error Compressed kernel image not given via KERNEL_IMG_IN)
+endif
+endif
+
+.PHONY: all install clean	# goal names, never files on disk
+all: $(KERNEL_IMG_OUT)
+
+install:
+$(BUILD_DIR)/%.o : %.c
+	@mkdir -p $(dir $@)
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+$(BUILD_DIR)/%.o : %.S
+	@mkdir -p $(dir $@)
+	$(CC) $(ASFLAGS) -c -o $@ $<
+
+$(IMAGE_OBJ): $(KERNEL_IMG_IN)
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) -I binary -O $(O_FORMAT) --rename-section .data=.kernel $< $@
+$(IMAGE_ELF): $(OBJECTS)
+	$(LD) $(LDFLAGS) -o $@ $(OBJECTS)
+
+$(KERNEL_IMG_OUT): $(IMAGE_ELF)
+	$(OBJCOPY) -O binary $< $@
+
+clean:
+	$(if $(strip $(BUILD_DIR)),,$(error BUILD_DIR is empty - refusing to run "rm -rf /"))
+	rm -rf $(BUILD_DIR)/
diff --git a/target/linux/realtek/image/rt-loader/include/board.h b/target/linux/realtek/image/rt-loader/include/board.h
new file mode 100644
index 00000000000..b0d0945890a
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/include/board.h
@@ -0,0 +1,14 @@
+/*
+ * rt-loader header
+ * (c) 2025 Markus Stockhausen
+ */
+
+#ifndef _BOARD_H_
+#define _BOARD_H_
+
+unsigned int board_get_memory(void);
+void board_get_system(char *buffer, int len);
+void board_panic(void);
+void board_putchar(int ch, void *ctx);	/* npf_putc-compatible sink used by the printf() macro */
+
+#endif  // _BOARD_H_
diff --git a/target/linux/realtek/image/rt-loader/include/globals.h b/target/linux/realtek/image/rt-loader/include/globals.h
new file mode 100644
index 00000000000..49052b81559
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/include/globals.h
@@ -0,0 +1,17 @@
+/*
+ * rt-loader header
+ * (c) 2025 Markus Stockhausen
+ */
+
+#ifndef _GLOBALS_H_
+#define _GLOBALS_H_
+
+#define KSEG0			0x80000000
+#define STACK_SIZE		0x10000
+#define HEAP_SIZE		0x40000
+#define MEMORY_ALIGNMENT	32
+
+#define printf(fmt, ...)	npf_pprintf(board_putchar, NULL, fmt, ##__VA_ARGS__)
+#define snprintf		npf_snprintf
+
+#endif  // _GLOBALS_H_
diff --git a/target/linux/realtek/image/rt-loader/include/memory.h b/target/linux/realtek/image/rt-loader/include/memory.h
new file mode 100644
index 00000000000..80d0f8a2835
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/include/memory.h
@@ -0,0 +1,30 @@
+/*
+ * rt-loader header
+ * (c) 2025 Markus Stockhausen
+ */
+
+#ifndef _MEMORY_H_
+#define _MEMORY_H_
+
+#include <stddef.h>
+#include "globals.h"
+
+#define CACHE_HIT_INVALIDATE_I		0x10
+#define CACHE_HIT_WRITEBACK_INV_D	0x15
+
+#define ioread32(reg)			(*(volatile int *)(reg))
+#define iowrite32(val, reg)		(*(volatile int *)(reg) = val)
+
+void flush_cache(void *start_addr, unsigned long size);
+void free(void *ptr);
+void *malloc(size_t size);
+int memcmp(const void *s1, const void *s2, size_t count);
+void *memmove(void *dst, const void *src, size_t count);
+void *memcpy(void *dst, const void *src, size_t count);
+void *memset(void *dst, int value, size_t count);
+size_t strlen(const char *s);
+
+extern void *_heap_addr;
+extern void *_heap_addr_max;
+
+#endif  // _MEMORY_H_
diff --git a/target/linux/realtek/image/rt-loader/include/nanoprintf.h b/target/linux/realtek/image/rt-loader/include/nanoprintf.h
new file mode 100644
index 00000000000..a415ad9f0df
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/include/nanoprintf.h
@@ -0,0 +1,1203 @@
+/* nanoprintf v0.5.5: a tiny embeddable printf replacement written in C.
+   https://github.com/charlesnicholson/nanoprintf
+   charles.nicholson+nanoprintf@gmail.com
+   dual-licensed under 0bsd and unlicense, take your pick. see eof for details. */
+
+#ifndef NPF_H_INCLUDED
+#define NPF_H_INCLUDED
+
+#include <stdarg.h>
+#include <stddef.h>
+
+// Define this to fully sandbox nanoprintf inside of a translation unit.
+#ifdef NANOPRINTF_VISIBILITY_STATIC
+  #define NPF_VISIBILITY static
+#else
+  #define NPF_VISIBILITY extern
+#endif
+
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+  #define NPF_PRINTF_ATTR(FORMAT_INDEX, VARGS_INDEX) \
+    __attribute__((format(printf, FORMAT_INDEX, VARGS_INDEX)))
+#else
+  #define NPF_PRINTF_ATTR(FORMAT_INDEX, VARGS_INDEX)
+#endif
+
+// Public API
+
+#ifdef __cplusplus
+#define NPF_RESTRICT
+extern "C" {
+#else
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+#define NPF_RESTRICT restrict
+#else
+#define NPF_RESTRICT
+#endif
+#endif
+
+// The npf_ functions all return the number of bytes required to express the
+// fully-formatted string, not including the null terminator character.
+// The npf_ functions do not return negative values, since the lack of 'l' length
+// modifier support makes encoding errors impossible.
+
+NPF_VISIBILITY int npf_snprintf(char * NPF_RESTRICT buffer,
+                                size_t bufsz,
+                                const char * NPF_RESTRICT format,
+                                ...) NPF_PRINTF_ATTR(3, 4);
+
+NPF_VISIBILITY int npf_vsnprintf(char * NPF_RESTRICT buffer,
+                                 size_t bufsz,
+                                 char const * NPF_RESTRICT format,
+                                 va_list vlist)   NPF_PRINTF_ATTR(3, 0);
+
+typedef void (*npf_putc)(int c, void *ctx);
+NPF_VISIBILITY int npf_pprintf(npf_putc pc,
+                               void * NPF_RESTRICT pc_ctx,
+                               char const * NPF_RESTRICT format,
+                               ...) NPF_PRINTF_ATTR(3, 4);
+
+NPF_VISIBILITY int npf_vpprintf(npf_putc pc,
+                                void * NPF_RESTRICT pc_ctx,
+                                char const * NPF_RESTRICT format,
+                                va_list vlist) NPF_PRINTF_ATTR(3, 0);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NPF_H_INCLUDED
+
+/* The implementation of nanoprintf begins here, to be compiled only if
+   NANOPRINTF_IMPLEMENTATION is defined. In a multi-file library what follows would
+   be nanoprintf.c. */
+
+#ifdef NANOPRINTF_IMPLEMENTATION
+
+#ifndef NPF_IMPLEMENTATION_INCLUDED
+#define NPF_IMPLEMENTATION_INCLUDED
+
+#include <limits.h>
+#include <stdint.h>
+
+// The conversion buffer must fit at least UINT64_MAX in octal format with the leading '0'.
+#ifndef NANOPRINTF_CONVERSION_BUFFER_SIZE
+  #define NANOPRINTF_CONVERSION_BUFFER_SIZE    23
+#endif
+#if NANOPRINTF_CONVERSION_BUFFER_SIZE < 23
+  #error The size of the conversion buffer must be at least 23 bytes.
+#endif
+
+// Pick reasonable defaults if nothing's been configured.
+#if !defined(NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_ALT_FORM_FLAG)
+  #define NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS 1
+  #define NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS 1
+  #define NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS 1
+  #define NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS 0
+  #define NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS 1
+  #define NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS 0
+  #define NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS 0
+  #define NANOPRINTF_USE_ALT_FORM_FLAG 1
+#endif
+
+// If anything's been configured, everything must be configured.
+#ifndef NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+
+// Ensure flags are compatible.
+#if (NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1) && \
+    (NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 0)
+  #error Precision format specifiers must be enabled if float support is enabled.
+#endif
+
+// intmax_t / uintmax_t require stdint from c99 / c++11
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+  #ifndef _MSC_VER
+    #ifdef __cplusplus
+      #if __cplusplus < 201103L
+        #error large format specifier support requires C++11 or later.
+      #endif
+    #else
+      #if __STDC_VERSION__ < 199409L
+        #error nanoprintf requires C99 or later.
+      #endif
+    #endif
+  #endif
+#endif
+
+// Figure out if we can disable warnings with pragmas.
+#ifdef __clang__
+  #define NPF_CLANG 1
+  #define NPF_GCC_PAST_4_6 0
+#else
+  #define NPF_CLANG 0
+  #if defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
+    #define NPF_GCC_PAST_4_6 1
+  #else
+    #define NPF_GCC_PAST_4_6 0
+  #endif
+#endif
+
+#if NPF_CLANG || NPF_GCC_PAST_4_6
+  #define NPF_HAVE_GCC_WARNING_PRAGMAS 1
+#else
+  #define NPF_HAVE_GCC_WARNING_PRAGMAS 0
+#endif
+
+#if NPF_HAVE_GCC_WARNING_PRAGMAS
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wunused-function"
+  #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+  #ifdef __cplusplus
+    #pragma GCC diagnostic ignored "-Wold-style-cast"
+  #endif
+  #pragma GCC diagnostic ignored "-Wpadded"
+  #pragma GCC diagnostic ignored "-Wfloat-equal"
+  #if NPF_CLANG
+    #pragma GCC diagnostic ignored "-Wc++98-compat-pedantic"
+    #pragma GCC diagnostic ignored "-Wcovered-switch-default"
+    #pragma GCC diagnostic ignored "-Wdeclaration-after-statement"
+    #pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
+    #ifndef __APPLE__
+      #pragma GCC diagnostic ignored "-Wunsafe-buffer-usage"
+    #endif
+  #elif NPF_GCC_PAST_4_6
+    #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+  #endif
+#endif
+
+#ifdef _MSC_VER
+  #pragma warning(push)
+  #pragma warning(disable:4619) // there is no warning number 'number'
+  // C4619 has to be disabled first!
+  #pragma warning(disable:4127) // conditional expression is constant
+  #pragma warning(disable:4505) // unreferenced local function has been removed
+  #pragma warning(disable:4514) // unreferenced inline function has been removed
+  #pragma warning(disable:4701) // potentially uninitialized local variable used
+  #pragma warning(disable:4706) // assignment within conditional expression
+  #pragma warning(disable:4710) // function not inlined
+  #pragma warning(disable:4711) // function selected for inline expansion
+  #pragma warning(disable:4820) // padding added after struct member
+  #pragma warning(disable:5039) // potentially throwing function passed to extern C function
+  #pragma warning(disable:5045) // compiler will insert Spectre mitigation for memory load
+  #pragma warning(disable:5262) // implicit switch fall-through
+  #pragma warning(disable:26812) // enum type is unscoped
+#endif
+
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+  #define NPF_NOINLINE __attribute__((noinline))
+  #define NPF_FORCE_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+  #define NPF_NOINLINE __declspec(noinline)
+  #define NPF_FORCE_INLINE inline __forceinline
+#else
+  #define NPF_NOINLINE
+  #define NPF_FORCE_INLINE
+#endif
+
+#if (NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1) || \
+    (NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1)
+enum {
+  NPF_FMT_SPEC_OPT_NONE,
+  NPF_FMT_SPEC_OPT_LITERAL,
+  NPF_FMT_SPEC_OPT_STAR,
+};
+#endif
+
+enum {
+  NPF_FMT_SPEC_LEN_MOD_NONE,
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_LEN_MOD_SHORT,       // 'h'
+  NPF_FMT_SPEC_LEN_MOD_CHAR,        // 'hh'
+#endif
+  NPF_FMT_SPEC_LEN_MOD_LONG,        // 'l'
+  NPF_FMT_SPEC_LEN_MOD_LONG_DOUBLE, // 'L'
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_LEN_MOD_LARGE_LONG_LONG, // 'll'
+  NPF_FMT_SPEC_LEN_MOD_LARGE_INTMAX,    // 'j'
+  NPF_FMT_SPEC_LEN_MOD_LARGE_SIZET,     // 'z'
+  NPF_FMT_SPEC_LEN_MOD_LARGE_PTRDIFFT,  // 't'
+#endif
+};
+
+enum {
+  NPF_FMT_SPEC_CONV_NONE,
+  NPF_FMT_SPEC_CONV_PERCENT,      // '%'
+  NPF_FMT_SPEC_CONV_CHAR,         // 'c'
+  NPF_FMT_SPEC_CONV_STRING,       // 's'
+  NPF_FMT_SPEC_CONV_SIGNED_INT,   // 'i', 'd'
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_CONV_BINARY,       // 'b'
+#endif
+  NPF_FMT_SPEC_CONV_OCTAL,        // 'o'
+  NPF_FMT_SPEC_CONV_HEX_INT,      // 'x', 'X'
+  NPF_FMT_SPEC_CONV_UNSIGNED_INT, // 'u'
+  NPF_FMT_SPEC_CONV_POINTER,      // 'p'
+#if NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_CONV_WRITEBACK,    // 'n'
+#endif
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_CONV_FLOAT_DEC,      // 'f', 'F'
+  NPF_FMT_SPEC_CONV_FLOAT_SCI,      // 'e', 'E'
+  NPF_FMT_SPEC_CONV_FLOAT_SHORTEST, // 'g', 'G'
+  NPF_FMT_SPEC_CONV_FLOAT_HEX,      // 'a', 'A'
+#endif
+};
+
+typedef struct npf_format_spec {
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+  int field_width;
+#endif
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+  int prec;
+  uint8_t prec_opt;
+#endif
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+  uint8_t field_width_opt;
+  char left_justified;   // '-'
+  char leading_zero_pad; // '0'
+#endif
+  char prepend;          // ' ' or '+'
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+  char alt_form;         // '#'
+#endif
+  char case_adjust;      // 'a' - 'A' , or 0 (must be non-negative to work)
+  uint8_t length_modifier;
+  uint8_t conv_spec;
+} npf_format_spec_t;
+
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 0
+  typedef long npf_int_t;
+  typedef unsigned long npf_uint_t;
+#else
+  typedef intmax_t npf_int_t;
+  typedef uintmax_t npf_uint_t;
+#endif
+
+typedef struct npf_bufputc_ctx {
+  char *dst;
+  size_t len;
+  size_t cur;
+} npf_bufputc_ctx_t;
+
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+  typedef char npf_size_is_ptrdiff[(sizeof(size_t) == sizeof(ptrdiff_t)) ? 1 : -1];
+  typedef ptrdiff_t npf_ssize_t;
+  typedef size_t npf_uptrdiff_t;
+#endif
+
+#ifdef _MSC_VER
+  #include <intrin.h>
+#endif
+
+#define NPF_MIN(x, y)    ((x) <= (y) ? (x) : (y))
+#define NPF_MAX(x, y)    ((x) >= (y) ? (x) : (y))
+
+static int npf_parse_format_spec(char const *format, npf_format_spec_t *out_spec) {
+  char const *cur = format;
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+  out_spec->left_justified = 0;
+  out_spec->leading_zero_pad = 0;
+#endif
+  out_spec->case_adjust = 'a' - 'A'; // lowercase
+  out_spec->prepend = 0;
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+  out_spec->alt_form = 0;
+#endif
+
+  while (*++cur) { // cur points at the leading '%' character
+    switch (*cur) { // Optional flags
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+      case '-': out_spec->left_justified = '-'; out_spec->leading_zero_pad = 0; continue;
+      case '0': out_spec->leading_zero_pad = !out_spec->left_justified; continue;
+#endif
+      case '+': out_spec->prepend = '+'; continue;
+      case ' ': if (out_spec->prepend == 0) { out_spec->prepend = ' '; } continue;
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+      case '#': out_spec->alt_form = '#'; continue;
+#endif
+      default: break;
+    }
+    break;
+  }
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+  out_spec->field_width = 0;
+  out_spec->field_width_opt = NPF_FMT_SPEC_OPT_NONE;
+  if (*cur == '*') {
+    out_spec->field_width_opt = NPF_FMT_SPEC_OPT_STAR;
+    ++cur;
+  } else {
+    while ((*cur >= '0') && (*cur <= '9')) {
+      out_spec->field_width_opt = NPF_FMT_SPEC_OPT_LITERAL;
+      out_spec->field_width = (out_spec->field_width * 10) + (*cur++ - '0');
+    }
+  }
+#endif
+
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+  out_spec->prec = 0;
+  out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+  if (*cur == '.') {
+    ++cur;
+    if (*cur == '*') {
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_STAR;
+      ++cur;
+    } else {
+      if (*cur == '-') {
+        ++cur;
+      } else {
+        out_spec->prec_opt = NPF_FMT_SPEC_OPT_LITERAL;
+      }
+      while ((*cur >= '0') && (*cur <= '9')) {
+        out_spec->prec = (out_spec->prec * 10) + (*cur++ - '0');
+      }
+    }
+  }
+#endif
+
+  uint_fast8_t tmp_conv = NPF_FMT_SPEC_CONV_NONE;
+  out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_NONE;
+  switch (*cur++) { // Length modifier
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+    case 'h':
+      out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_SHORT;
+      if (*cur == 'h') {
+        out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_CHAR;
+        ++cur;
+      }
+      break;
+#endif
+    case 'l':
+      out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LONG;
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+      if (*cur == 'l') {
+        out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LARGE_LONG_LONG;
+        ++cur;
+      }
+#endif
+      break;
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+    case 'L': out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LONG_DOUBLE; break;
+#endif
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+    case 'j': out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LARGE_INTMAX; break;
+    case 'z': out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LARGE_SIZET; break;
+    case 't': out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LARGE_PTRDIFFT; break;
+#endif
+    default: --cur; break;
+  }
+
+  switch (*cur++) { // Conversion specifier
+    case '%': out_spec->conv_spec = NPF_FMT_SPEC_CONV_PERCENT;
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+      out_spec->prec = 0;
+#endif
+      break;
+
+    case 'c': out_spec->conv_spec = NPF_FMT_SPEC_CONV_CHAR;
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+      out_spec->prec = 0;
+#endif
+      break;
+
+    case 's': out_spec->conv_spec = NPF_FMT_SPEC_CONV_STRING;
+      break;
+
+    case 'i':
+    case 'd': tmp_conv = NPF_FMT_SPEC_CONV_SIGNED_INT; goto finish;
+    case 'o': tmp_conv = NPF_FMT_SPEC_CONV_OCTAL; goto finish;
+    case 'u': tmp_conv = NPF_FMT_SPEC_CONV_UNSIGNED_INT; goto finish;
+    case 'X': out_spec->case_adjust = 0;
+    case 'x': tmp_conv = NPF_FMT_SPEC_CONV_HEX_INT; goto finish;
+    finish:
+      out_spec->conv_spec = (uint8_t)tmp_conv;
+#if (NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1) && \
+    (NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1)
+      if (out_spec->prec_opt != NPF_FMT_SPEC_OPT_NONE) { out_spec->leading_zero_pad = 0; }
+#endif
+      break;
+
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+    case 'F': out_spec->case_adjust = 0;
+    case 'f':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_FLOAT_DEC;
+      if (out_spec->prec_opt == NPF_FMT_SPEC_OPT_NONE) { out_spec->prec = 6; }
+      break;
+
+    case 'E': out_spec->case_adjust = 0;
+    case 'e':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_FLOAT_SCI;
+      if (out_spec->prec_opt == NPF_FMT_SPEC_OPT_NONE) { out_spec->prec = 6; }
+      break;
+
+    case 'G': out_spec->case_adjust = 0;
+    case 'g':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_FLOAT_SHORTEST;
+      if (out_spec->prec_opt == NPF_FMT_SPEC_OPT_NONE) { out_spec->prec = 6; }
+      break;
+
+    case 'A': out_spec->case_adjust = 0;
+    case 'a':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_FLOAT_HEX;
+      if (out_spec->prec_opt == NPF_FMT_SPEC_OPT_NONE) { out_spec->prec = 6; }
+      break;
+#endif
+
+#if NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS == 1
+    case 'n':
+      // todo: reject string if flags or width or precision exist
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_WRITEBACK;
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+#endif
+      break;
+#endif
+
+    case 'p':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_POINTER;
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+#endif
+      break;
+
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+    case 'B':
+      out_spec->case_adjust = 0;
+    case 'b':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_BINARY;
+      break;
+#endif
+
+    default: return 0;
+  }
+
+  return (int)(cur - format);
+}
+
+static NPF_NOINLINE int npf_utoa_rev(
+    npf_uint_t val, char *buf, uint_fast8_t base, char case_adj) {
+  uint_fast8_t n = 0;
+  do {
+    int_fast8_t const d = (int_fast8_t)(val % base);
+    *buf++ = (char)(((d < 10) ? '0' : ('A' - 10 + case_adj)) + d);
+    ++n;
+    val /= base;
+  } while (val);
+  return (int)n;
+}
+
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+
+#include <float.h>
+
+#if (DBL_MANT_DIG <= 11) && (DBL_MAX_EXP <= 16)
+  typedef uint_fast16_t npf_double_bin_t;
+  typedef int_fast8_t npf_ftoa_exp_t;
+#elif (DBL_MANT_DIG <= 24) && (DBL_MAX_EXP <= 128)
+  typedef uint_fast32_t npf_double_bin_t;
+  typedef int_fast8_t npf_ftoa_exp_t;
+#elif (DBL_MANT_DIG <= 53) && (DBL_MAX_EXP <= 1024)
+  typedef uint_fast64_t npf_double_bin_t;
+  typedef int_fast16_t npf_ftoa_exp_t;
+#else
+  #error Unsupported width of the double type.
+#endif
+
+// The floating point conversion code works with an unsigned integer type of any size.
+#ifndef NANOPRINTF_CONVERSION_FLOAT_TYPE
+  #define NANOPRINTF_CONVERSION_FLOAT_TYPE unsigned int
+#endif
+typedef NANOPRINTF_CONVERSION_FLOAT_TYPE npf_ftoa_man_t;
+
+#if (NANOPRINTF_CONVERSION_BUFFER_SIZE <= UINT_FAST8_MAX) && (UINT_FAST8_MAX <= INT_MAX)
+  typedef uint_fast8_t npf_ftoa_dec_t;
+#else
+  typedef int npf_ftoa_dec_t;
+#endif
+
+enum {
+  NPF_DOUBLE_EXP_MASK = DBL_MAX_EXP * 2 - 1,
+  NPF_DOUBLE_EXP_BIAS = DBL_MAX_EXP - 1,
+  NPF_DOUBLE_MAN_BITS = DBL_MANT_DIG - 1,
+  NPF_DOUBLE_BIN_BITS = sizeof(npf_double_bin_t) * CHAR_BIT,
+  NPF_DOUBLE_SIGN_POS = sizeof(double) * CHAR_BIT - 1,
+  NPF_FTOA_MAN_BITS   = sizeof(npf_ftoa_man_t) * CHAR_BIT,
+  NPF_FTOA_SHIFT_BITS =
+    ((NPF_FTOA_MAN_BITS < DBL_MANT_DIG) ? NPF_FTOA_MAN_BITS : DBL_MANT_DIG) - 1
+};
+
+/* Generally, floating-point conversion implementations use
+   grisu2 (https://bit.ly/2JgMggX) and ryu (https://bit.ly/2RLXSg0) algorithms,
+   which are mathematically exact and fast, but require large lookup tables.
+
+   This implementation was inspired by Wojciech Muła's (zdjęcia@garnek.pl)
+   algorithm (http://0x80.pl/notesen/2015-12-29-float-to-string.html) and
+   extended further by adding dynamic scaling and configurable integer width by
+   Oskars Rubenis (https://github.com/Okarss). */
+
+static NPF_FORCE_INLINE npf_double_bin_t npf_double_to_int_rep(double f) {
+  // Union-cast is UB pre-C11 and in all C++; the compiler optimizes the code below.
+  npf_double_bin_t bin;
+  char const *src = (char const *)&f;
+  char *dst = (char *)&bin;
+  for (uint_fast8_t i = 0; i < sizeof(f); ++i) { dst[i] = src[i]; }
+  return bin;
+}
+
+static int npf_ftoa_rev(char *buf, npf_format_spec_t const *spec, double f) {
+  char const *ret = NULL;
+  npf_double_bin_t bin = npf_double_to_int_rep(f);
+
+  // Unsigned -> signed int casting is IB and can raise a signal but generally doesn't.
+  npf_ftoa_exp_t exp =
+    (npf_ftoa_exp_t)((npf_ftoa_exp_t)(bin >> NPF_DOUBLE_MAN_BITS) & NPF_DOUBLE_EXP_MASK);
+
+  bin &= ((npf_double_bin_t)0x1 << NPF_DOUBLE_MAN_BITS) - 1;
+  if (exp == (npf_ftoa_exp_t)NPF_DOUBLE_EXP_MASK) { // special value
+    ret = (bin) ? "NAN" : "FNI";
+    goto exit;
+  }
+  if (spec->prec > (NANOPRINTF_CONVERSION_BUFFER_SIZE - 2)) { goto exit; }
+  if (exp) { // normal number
+    bin |= (npf_double_bin_t)0x1 << NPF_DOUBLE_MAN_BITS;
+  } else { // subnormal number
+    ++exp;
+  }
+  exp = (npf_ftoa_exp_t)(exp - NPF_DOUBLE_EXP_BIAS);
+
+  uint_fast8_t carry; carry = 0;
+  npf_ftoa_dec_t end, dec; dec = (npf_ftoa_dec_t)spec->prec;
+  if (dec
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+      || spec->alt_form
+#endif
+  ) {
+    buf[dec++] = '.';
+  }
+
+  { // Integer part
+    npf_ftoa_man_t man_i;
+
+    if (exp >= 0) {
+      int_fast8_t shift_i =
+        (int_fast8_t)((exp > NPF_FTOA_SHIFT_BITS) ? (int)NPF_FTOA_SHIFT_BITS : exp);
+      npf_ftoa_exp_t exp_i = (npf_ftoa_exp_t)(exp - shift_i);
+      shift_i = (int_fast8_t)(NPF_DOUBLE_MAN_BITS - shift_i);
+      man_i = (npf_ftoa_man_t)(bin >> shift_i);
+
+      if (exp_i) {
+        if (shift_i) {
+          carry = (bin >> (shift_i - 1)) & 0x1;
+        }
+        exp = NPF_DOUBLE_MAN_BITS; // invalidate the fraction part
+      }
+
+      // Scale the exponent from base-2 to base-10.
+      for (; exp_i; --exp_i) {
+        if (!(man_i & ((npf_ftoa_man_t)0x1 << (NPF_FTOA_MAN_BITS - 1)))) {
+          man_i = (npf_ftoa_man_t)(man_i << 1);
+          man_i = (npf_ftoa_man_t)(man_i | carry); carry = 0;
+        } else {
+          if (dec >= NANOPRINTF_CONVERSION_BUFFER_SIZE) { goto exit; }
+          buf[dec++] = '0';
+          carry = (((uint_fast8_t)(man_i % 5) + carry) > 2);
+          man_i /= 5;
+        }
+      }
+    } else {
+      man_i = 0;
+    }
+    end = dec;
+
+    do { // Print the integer
+      if (end >= NANOPRINTF_CONVERSION_BUFFER_SIZE) { goto exit; }
+      buf[end++] = (char)('0' + (char)(man_i % 10));
+      man_i /= 10;
+    } while (man_i);
+  }
+
+  { // Fraction part
+    npf_ftoa_man_t man_f;
+    npf_ftoa_dec_t dec_f = (npf_ftoa_dec_t)spec->prec;
+
+    if (exp < NPF_DOUBLE_MAN_BITS) {
+      int_fast8_t shift_f = (int_fast8_t)((exp < 0) ? -1 : exp);
+      npf_ftoa_exp_t exp_f = (npf_ftoa_exp_t)(exp - shift_f);
+      npf_double_bin_t bin_f =
+        bin << ((NPF_DOUBLE_BIN_BITS - NPF_DOUBLE_MAN_BITS) + shift_f);
+
+      // This if-else statement can be completely optimized at compile time.
+      if (NPF_DOUBLE_BIN_BITS > NPF_FTOA_MAN_BITS) {
+        man_f = (npf_ftoa_man_t)(bin_f >> ((unsigned)(NPF_DOUBLE_BIN_BITS -
+                                                      NPF_FTOA_MAN_BITS) %
+                                           NPF_DOUBLE_BIN_BITS));
+        carry = (uint_fast8_t)((bin_f >> ((unsigned)(NPF_DOUBLE_BIN_BITS -
+                                                     NPF_FTOA_MAN_BITS - 1) %
+                                          NPF_DOUBLE_BIN_BITS)) & 0x1);
+      } else {
+        man_f = (npf_ftoa_man_t)((npf_ftoa_man_t)bin_f
+                                 << ((unsigned)(NPF_FTOA_MAN_BITS -
+                                                NPF_DOUBLE_BIN_BITS) % NPF_FTOA_MAN_BITS));
+        carry = 0;
+      }
+
+      // Scale the exponent from base-2 to base-10 and prepare the first digit.
+      for (uint_fast8_t digit = 0; dec_f && (exp_f < 4); ++exp_f) {
+        if ((man_f > ((npf_ftoa_man_t)-4 / 5)) || digit) {
+          carry = (uint_fast8_t)(man_f & 0x1);
+          man_f = (npf_ftoa_man_t)(man_f >> 1);
+        } else {
+          man_f = (npf_ftoa_man_t)(man_f * 5);
+          if (carry) { man_f = (npf_ftoa_man_t)(man_f + 3); carry = 0; }
+          if (exp_f < 0) {
+            buf[--dec_f] = '0';
+          } else {
+            ++digit;
+          }
+        }
+      }
+      man_f = (npf_ftoa_man_t)(man_f + carry);
+      carry = (exp_f >= 0);
+      dec = 0;
+    } else {
+      man_f = 0;
+    }
+
+    if (dec_f) {
+      // Print the fraction
+      for (;;) {
+        buf[--dec_f] = (char)('0' + (char)(man_f >> (NPF_FTOA_MAN_BITS - 4)));
+        man_f = (npf_ftoa_man_t)(man_f & ~((npf_ftoa_man_t)0xF << (NPF_FTOA_MAN_BITS - 4)));
+        if (!dec_f) { break; }
+        man_f = (npf_ftoa_man_t)(man_f * 10);
+      }
+      man_f = (npf_ftoa_man_t)(man_f << 4);
+    }
+    if (exp < NPF_DOUBLE_MAN_BITS) {
+      carry &= (uint_fast8_t)(man_f >> (NPF_FTOA_MAN_BITS - 1));
+    }
+  }
+
+  // Round the number
+  for (; carry; ++dec) {
+    if (dec >= NANOPRINTF_CONVERSION_BUFFER_SIZE) { goto exit; }
+    if (dec >= end) { buf[end++] = '0'; }
+    if (buf[dec] == '.') { continue; }
+    carry = (buf[dec] == '9');
+    buf[dec] = (char)(carry ? '0' : (buf[dec] + 1));
+  }
+
+  return (int)end;
+exit:
+  if (!ret) { ret = "RRE"; }
+  uint_fast8_t i;
+  for (i = 0; ret[i]; ++i) { buf[i] = (char)(ret[i] + spec->case_adjust); }
+  return -(int)i;
+}
+
+#endif // NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS
+
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+static int npf_bin_len(npf_uint_t u) {
+  // Return the length of the binary string format of 'u', preferring intrinsics.
+  if (!u) { return 1; }
+
+#ifdef _MSC_VER // Win64, use _BSR64 for everything. If x86, use _BSR when non-large.
+  #ifdef _M_X64
+    #define NPF_HAVE_BUILTIN_CLZ
+    #define NPF_CLZ _BitScanReverse64
+  #elif NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 0
+    #define NPF_HAVE_BUILTIN_CLZ
+    #define NPF_CLZ _BitScanReverse
+  #endif
+  #ifdef NPF_HAVE_BUILTIN_CLZ
+    unsigned long idx;
+    NPF_CLZ(&idx, u);
+    return (int)(idx + 1);
+  #endif
+#elif NPF_CLANG || NPF_GCC_PAST_4_6
+  #define NPF_HAVE_BUILTIN_CLZ
+  #if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+    #define NPF_CLZ(X) ((sizeof(long long) * CHAR_BIT) - (size_t)__builtin_clzll(X))
+  #else
+    #define NPF_CLZ(X) ((sizeof(long) * CHAR_BIT) - (size_t)__builtin_clzl(X))
+  #endif
+  return (int)NPF_CLZ(u);
+#endif
+
+#ifndef NPF_HAVE_BUILTIN_CLZ
+  int n;
+  for (n = 0; u; ++n, u >>= 1); // slow but small software fallback
+  return n;
+#else
+  #undef NPF_HAVE_BUILTIN_CLZ
+  #undef NPF_CLZ
+#endif
+}
+#endif
+
+static void npf_bufputc(int c, void *ctx) { // putc callback that writes into a bounded buffer
+  npf_bufputc_ctx_t *const sink = (npf_bufputc_ctx_t *)ctx;
+  if (sink->cur < sink->len) { sink->dst[sink->cur] = (char)c; sink->cur++; } // else: full, drop c
+}
+
+static void npf_bufputc_nop(int c, void *ctx) { (void)c; (void)ctx; } // sink that discards every character
+
+typedef struct npf_cnt_putc_ctx { // wraps an output callback and counts the characters sent to it
+  npf_putc pc; // wrapped output callback
+  void *ctx;   // opaque context that is forwarded to 'pc'
+  int n;       // number of characters forwarded so far
+} npf_cnt_putc_ctx_t;
+
+static void npf_putc_cnt(int c, void *ctx) { // count one character, then forward it
+  npf_cnt_putc_ctx_t *const counter = (npf_cnt_putc_ctx_t *)ctx;
+  counter->n += 1;
+  (*counter->pc)(c, counter->ctx); // kept as last statement so it can become a tail call
+}
+
+#define NPF_PUTC(VAL) do { npf_putc_cnt((int)(VAL), &pc_cnt); } while (0) // emit via local 'pc_cnt'
+// NPF_EXTRACT: switch case that reads one vararg as EXTRACT_AS and stores it in 'val' as CAST_TO.
+#define NPF_EXTRACT(MOD, CAST_TO, EXTRACT_AS) \
+  case NPF_FMT_SPEC_LEN_MOD_##MOD: val = (CAST_TO)va_arg(args, EXTRACT_AS); break
+// NPF_WRITEBACK: switch case that stores the running character count through a TYPE * vararg.
+#define NPF_WRITEBACK(MOD, TYPE) \
+  case NPF_FMT_SPEC_LEN_MOD_##MOD: *(va_arg(args, TYPE *)) = (TYPE)pc_cnt.n; break
+
+int npf_vpprintf(npf_putc pc, void *pc_ctx, char const *format, va_list args) {
+  npf_format_spec_t fs;
+  char const *cur = format;
+  npf_cnt_putc_ctx_t pc_cnt;
+  pc_cnt.pc = pc;
+  pc_cnt.ctx = pc_ctx;
+  pc_cnt.n = 0;
+  // Literal characters are forwarded as-is; each '%' spec is parsed, converted and padded.
+  while (*cur) {
+    int const fs_len = (*cur != '%') ? 0 : npf_parse_format_spec(cur, &fs);
+    if (!fs_len) { NPF_PUTC(*cur++); continue; }
+    cur += fs_len;
+
+    // Extract star-args immediately
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    if (fs.field_width_opt == NPF_FMT_SPEC_OPT_STAR) {
+      fs.field_width = va_arg(args, int);
+      if (fs.field_width < 0) {
+        fs.field_width = -fs.field_width;
+        fs.left_justified = 1;
+      }
+    }
+#endif
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+    if (fs.prec_opt == NPF_FMT_SPEC_OPT_STAR) {
+      fs.prec = va_arg(args, int);
+      if (fs.prec < 0) { fs.prec_opt = NPF_FMT_SPEC_OPT_NONE; }
+    }
+#endif
+    // Scratch state of this conversion; 'u.binval' keeps the raw value of a binary conversion.
+    union { char cbuf_mem[NANOPRINTF_CONVERSION_BUFFER_SIZE]; npf_uint_t binval; } u;
+    char *cbuf = u.cbuf_mem, sign_c = 0;
+    int cbuf_len = 0;
+    char need_0x = 0;
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    int field_pad = 0;
+    char pad_c = 0;
+#endif
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+    int prec_pad = 0;
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    uint_fast8_t zero = 0;
+#endif
+#endif
+
+    // Extract and convert the argument to string, point cbuf at the text.
+    switch (fs.conv_spec) {
+      case NPF_FMT_SPEC_CONV_PERCENT:
+        *cbuf = '%';
+        cbuf_len = 1;
+        break;
+
+      case NPF_FMT_SPEC_CONV_CHAR:
+        *cbuf = (char)va_arg(args, int);
+        cbuf_len = (*cbuf) ? 1 : 0; // a NUL character emits nothing
+        break;
+
+      case NPF_FMT_SPEC_CONV_STRING: {
+        cbuf = va_arg(args, char *);
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+        for (char const *s = cbuf;
+             ((fs.prec_opt == NPF_FMT_SPEC_OPT_NONE) || (cbuf_len < fs.prec)) && cbuf && *s;
+             ++s, ++cbuf_len);
+#else
+        for (char const *s = cbuf; cbuf && *s; ++s, ++cbuf_len); // strlen
+#endif
+      } break;
+
+      case NPF_FMT_SPEC_CONV_SIGNED_INT: {
+        npf_int_t val = 0;
+        switch (fs.length_modifier) {
+          NPF_EXTRACT(NONE, int, int);
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+          NPF_EXTRACT(SHORT, short, int);
+          NPF_EXTRACT(CHAR, signed char, int);
+#endif
+          NPF_EXTRACT(LONG, long, long);
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+          NPF_EXTRACT(LARGE_LONG_LONG, long long, long long);
+          NPF_EXTRACT(LARGE_INTMAX, intmax_t, intmax_t);
+          NPF_EXTRACT(LARGE_SIZET, npf_ssize_t, npf_ssize_t);
+          NPF_EXTRACT(LARGE_PTRDIFFT, ptrdiff_t, ptrdiff_t);
+#endif
+          default: break;
+        }
+
+        sign_c = (val < 0) ? '-' : fs.prepend;
+
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+        zero = !val;
+#endif
+        // special case, if prec and value are 0, skip
+        if (!val && (fs.prec_opt != NPF_FMT_SPEC_OPT_NONE) && !fs.prec) {
+          cbuf_len = 0;
+        } else
+#endif
+        {
+          npf_uint_t uval = (npf_uint_t)val;
+          if (val < 0) { uval = 0 - uval; } // negate in unsigned arithmetic
+          cbuf_len = npf_utoa_rev(uval, cbuf, 10, fs.case_adjust);
+        }
+      } break;
+
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+      case NPF_FMT_SPEC_CONV_BINARY:
+#endif
+      case NPF_FMT_SPEC_CONV_OCTAL:
+      case NPF_FMT_SPEC_CONV_HEX_INT:
+      case NPF_FMT_SPEC_CONV_UNSIGNED_INT:
+      case NPF_FMT_SPEC_CONV_POINTER: {
+        npf_uint_t val = 0;
+
+        if (fs.conv_spec == NPF_FMT_SPEC_CONV_POINTER) {
+          val = (npf_uint_t)(uintptr_t)va_arg(args, void *);
+        } else {
+          switch (fs.length_modifier) {
+            NPF_EXTRACT(NONE, unsigned, unsigned);
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+            NPF_EXTRACT(SHORT, unsigned short, unsigned);
+            NPF_EXTRACT(CHAR, unsigned char, unsigned);
+#endif
+            NPF_EXTRACT(LONG, unsigned long, unsigned long);
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+            NPF_EXTRACT(LARGE_LONG_LONG, unsigned long long, unsigned long long);
+            NPF_EXTRACT(LARGE_INTMAX, uintmax_t, uintmax_t);
+            NPF_EXTRACT(LARGE_SIZET, size_t, size_t);
+            NPF_EXTRACT(LARGE_PTRDIFFT, npf_uptrdiff_t, npf_uptrdiff_t);
+#endif
+            default: break;
+          }
+        }
+
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+        zero = !val;
+#endif
+        if (!val && (fs.prec_opt != NPF_FMT_SPEC_OPT_NONE) && !fs.prec) {
+          // Zero value and explicitly-requested zero precision means "print nothing".
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+          if ((fs.conv_spec == NPF_FMT_SPEC_CONV_OCTAL) && fs.alt_form) {
+            fs.prec = 1; // octal special case, print a single '0'
+          }
+#endif
+        } else
+#endif
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+        if (fs.conv_spec == NPF_FMT_SPEC_CONV_BINARY) {
+          cbuf_len = npf_bin_len(val); u.binval = val; // the digits are produced at output time
+        } else
+#endif
+        {
+          uint_fast8_t const base = (fs.conv_spec == NPF_FMT_SPEC_CONV_OCTAL) ?
+            8u : ((fs.conv_spec == NPF_FMT_SPEC_CONV_UNSIGNED_INT) ? 10u : 16u);
+          cbuf_len = npf_utoa_rev(val, cbuf, base, fs.case_adjust);
+        }
+
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+        if (val && fs.alt_form && (fs.conv_spec == NPF_FMT_SPEC_CONV_OCTAL)) {
+          cbuf[cbuf_len++] = '0'; // OK to add leading octal '0' immediately.
+        }
+
+        if (val && fs.alt_form) { // 0x or 0b but can't write it yet.
+          if ((fs.conv_spec == NPF_FMT_SPEC_CONV_HEX_INT) ||
+              (fs.conv_spec == NPF_FMT_SPEC_CONV_POINTER)) { need_0x = 'X'; }
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+          else if (fs.conv_spec == NPF_FMT_SPEC_CONV_BINARY) { need_0x = 'B'; }
+#endif
+          if (need_0x) { need_0x = (char)(need_0x + fs.case_adjust); }
+        }
+#endif
+      } break;
+
+#if NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS == 1
+      case NPF_FMT_SPEC_CONV_WRITEBACK:
+        switch (fs.length_modifier) {
+          NPF_WRITEBACK(NONE, int);
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+          NPF_WRITEBACK(SHORT, short);
+          NPF_WRITEBACK(CHAR, signed char);
+#endif
+          NPF_WRITEBACK(LONG, long);
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+          NPF_WRITEBACK(LARGE_LONG_LONG, long long);
+          NPF_WRITEBACK(LARGE_INTMAX, intmax_t);
+          NPF_WRITEBACK(LARGE_SIZET, npf_ssize_t);
+          NPF_WRITEBACK(LARGE_PTRDIFFT, ptrdiff_t);
+#endif
+          default: break;
+        } break;
+#endif
+
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+      case NPF_FMT_SPEC_CONV_FLOAT_DEC:
+      case NPF_FMT_SPEC_CONV_FLOAT_SCI:
+      case NPF_FMT_SPEC_CONV_FLOAT_SHORTEST:
+      case NPF_FMT_SPEC_CONV_FLOAT_HEX: {
+        double val;
+        if (fs.length_modifier == NPF_FMT_SPEC_LEN_MOD_LONG_DOUBLE) {
+          val = (double)va_arg(args, long double);
+        } else {
+          val = va_arg(args, double);
+        }
+
+        sign_c = (npf_double_to_int_rep(val) >> NPF_DOUBLE_SIGN_POS) ? '-' : fs.prepend;
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+        zero = (val == 0.);
+#endif
+        cbuf_len = npf_ftoa_rev(cbuf, &fs, val);
+        if (cbuf_len < 0) { // negative means text (not number), so ignore the '0' flag
+           cbuf_len = -cbuf_len;
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+           fs.leading_zero_pad = 0;
+#endif
+        }
+      } break;
+#endif
+      default: break;
+    }
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    // Compute the field width pad character
+    if (fs.field_width_opt != NPF_FMT_SPEC_OPT_NONE) {
+      if (fs.leading_zero_pad) {
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+        if ((fs.prec_opt != NPF_FMT_SPEC_OPT_NONE) && !fs.prec && zero) {
+          pad_c = ' ';
+        } else
+#endif
+        { pad_c = '0'; }
+      } else { pad_c = ' '; }
+    }
+#endif
+
+    // Compute the number of bytes to truncate or '0'-pad.
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+    if (fs.conv_spec != NPF_FMT_SPEC_CONV_STRING) {
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+      // float precision is after the decimal point
+      if ((fs.conv_spec != NPF_FMT_SPEC_CONV_FLOAT_DEC) &&
+          (fs.conv_spec != NPF_FMT_SPEC_CONV_FLOAT_SCI) &&
+          (fs.conv_spec != NPF_FMT_SPEC_CONV_FLOAT_SHORTEST) &&
+          (fs.conv_spec != NPF_FMT_SPEC_CONV_FLOAT_HEX))
+#endif
+      { prec_pad = NPF_MAX(0, fs.prec - cbuf_len); }
+    }
+#endif
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    // Given the full converted length, how many pad bytes?
+    field_pad = fs.field_width - cbuf_len - !!sign_c;
+    if (need_0x) { field_pad -= 2; }
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+    field_pad -= prec_pad;
+#endif
+    field_pad = NPF_MAX(0, field_pad);
+
+    // Apply right-justified field width if requested
+    if (!fs.left_justified && pad_c) { // If leading zeros pad, sign goes first.
+      if (pad_c == '0') {
+        if (sign_c) { NPF_PUTC(sign_c); sign_c = 0; }
+        // Pad byte is '0', write '0x' before '0' pad chars.
+        if (need_0x) { NPF_PUTC('0'); NPF_PUTC(need_0x); }
+      }
+      while (field_pad-- > 0) { NPF_PUTC(pad_c); }
+      // Pad byte is ' ', write '0x' after ' ' pad chars but before number.
+      if ((pad_c != '0') && need_0x) { NPF_PUTC('0'); NPF_PUTC(need_0x); }
+    } else
+#endif
+    { if (need_0x) { NPF_PUTC('0'); NPF_PUTC(need_0x); } } // no pad, '0x' requested.
+
+    // Write the converted payload
+    if (fs.conv_spec == NPF_FMT_SPEC_CONV_STRING) {
+      for (int i = 0; cbuf && (i < cbuf_len); ++i) { NPF_PUTC(cbuf[i]); }
+    } else {
+      if (sign_c) { NPF_PUTC(sign_c); }
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      while (prec_pad-- > 0) { NPF_PUTC('0'); } // int precision leads.
+#endif
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+      if (fs.conv_spec == NPF_FMT_SPEC_CONV_BINARY) {
+        while (cbuf_len) { NPF_PUTC('0' + ((u.binval >> --cbuf_len) & 1)); }
+      } else
+#endif
+      { while (cbuf_len-- > 0) { NPF_PUTC(cbuf[cbuf_len]); } } // payload is reversed
+    }
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    if (fs.left_justified && pad_c) { // Apply left-justified field width
+      while (field_pad-- > 0) { NPF_PUTC(pad_c); }
+    }
+#endif
+  }
+  // pc_cnt.n counted every character that went through NPF_PUTC.
+  return pc_cnt.n;
+}
+
+#undef NPF_PUTC
+#undef NPF_EXTRACT
+#undef NPF_WRITEBACK
+
+// Variadic front end of npf_vpprintf(): sends the formatted text to the callback 'pc'.
+int npf_pprintf(npf_putc pc, void * NPF_RESTRICT pc_ctx,
+                char const * NPF_RESTRICT format, ...) {
+  va_list ap;
+  int written;
+  va_start(ap, format);
+  written = npf_vpprintf(pc, pc_ctx, format, ap);
+  va_end(ap);
+  return written; // result of npf_vpprintf(), passed through unchanged
+}
+
+// Variadic front end of npf_vsnprintf(): formats into 'buffer' with capacity 'bufsz'.
+int npf_snprintf(char * NPF_RESTRICT buffer, size_t bufsz,
+                 const char * NPF_RESTRICT format, ...) {
+  va_list ap;
+  int written;
+  va_start(ap, format);
+  written = npf_vsnprintf(buffer, bufsz, format, ap);
+  va_end(ap);
+  return written; // result of npf_vsnprintf(), passed through unchanged
+}
+
+// Format into 'buffer' (at most 'bufsz' bytes including the terminator) and return the
+// result of npf_vpprintf(). With a NULL buffer the output is discarded and only counted.
+int npf_vsnprintf(char * NPF_RESTRICT buffer, size_t bufsz,
+                  char const * NPF_RESTRICT format, va_list vlist) {
+  npf_bufputc_ctx_t sink;
+  sink.cur = 0;
+  sink.len = bufsz;
+  sink.dst = buffer;
+
+  int const n = npf_vpprintf(buffer ? npf_bufputc : npf_bufputc_nop, &sink, format, vlist);
+  if (!buffer || !bufsz) { return n; } // nowhere to put a terminator
+
+  size_t end; // index that receives the terminating NUL
+#ifdef NANOPRINTF_SNPRINTF_SAFE_EMPTY_STRING_ON_OVERFLOW
+  if ((n < 0) || ((unsigned)n >= bufsz)) { end = 0; } else { end = (size_t)n; }
+#else
+  if (n < 0) { end = 0; } else { end = NPF_MIN((unsigned)n, bufsz - 1); }
+#endif
+  buffer[end] = '\0';
+
+  return n;
+}
+
+#if NPF_HAVE_GCC_WARNING_PRAGMAS
+  #pragma GCC diagnostic pop
+#endif
+
+#ifdef _MSC_VER
+  #pragma warning(pop)
+#endif
+
+#endif // NPF_IMPLEMENTATION_INCLUDED
+#endif // NANOPRINTF_IMPLEMENTATION
+
+/*
+  nanoprintf is dual-licensed under both the "Unlicense" and the
+  "Zero-Clause BSD" (0BSD) licenses. The intent of this dual-licensing
+  structure is to make nanoprintf as consumable as possible in as many
+  environments / countries / companies as possible without any
+  encumbrances.
+
+  The text of the two licenses follows below:
+
+  ============================== UNLICENSE ==============================
+
+  This is free and unencumbered software released into the public domain.
+
+  Anyone is free to copy, modify, publish, use, compile, sell, or
+  distribute this software, either in source code form or as a compiled
+  binary, for any purpose, commercial or non-commercial, and by any
+  means.
+
+  In jurisdictions that recognize copyright laws, the author or authors
+  of this software dedicate any and all copyright interest in the
+  software to the public domain. We make this dedication for the benefit
+  of the public at large and to the detriment of our heirs and
+  successors. We intend this dedication to be an overt act of
+  relinquishment in perpetuity of all present and future rights to this
+  software under copyright law.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+  OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  OTHER DEALINGS IN THE SOFTWARE.
+
+  For more information, please refer to <http://unlicense.org>
+
+  ================================ 0BSD =================================
+
+  Copyright (C) 2019- by Charles Nicholson <charles.nicholson+nanoprintf@gmail.com>
+
+  Permission to use, copy, modify, and/or distribute this software for
+  any purpose with or without fee is hereby granted.
+
+  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
diff --git a/target/linux/realtek/image/rt-loader/linker/linker.ld b/target/linux/realtek/image/rt-loader/linker/linker.ld
new file mode 100644
index 00000000000..dd1fb5aaa00
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/linker/linker.ld
@@ -0,0 +1,41 @@
+ENTRY(_start)
+/* Output sections in image order. The startup code relies on the symbols defined below. */
+SECTIONS {
+	.text : {
+		*(.text)
+	}
+
+	.data : ALIGN(32) {
+		*(.sdata*)
+		*(.data*)
+	}
+/*
+ * In MIPS position independent code (PIC), the global offset table (GOT) is a data structure
+ * used to facilitate access to global variables and functions when the code's final memory
+ * location is not known at compile time. The GOT contains absolute addresses of global symbols,
+ * but is itself located using a relative reference. This allows the code to be relocated at
+ * runtime without modification.
+ */
+	.got : ALIGN(32) {
+		__got_start = .;	/* range of entries that the startup code patches */
+		*(.got*)
+		__got_end = .;
+	}
+/*
+ * Storage for the compressed kernel image that was integrated into the loader during link time.
+ * No code, just binary data.
+ */
+	.kernel : ALIGN(1) {
+		__kernel_data_start = .;
+		KEEP(*(.kernel))
+		__kernel_data_end = .;
+	}
+/* Uninitialized globals. The startup code clears this range during the first run. */
+	.bss (NOLOAD) : ALIGN(4) {
+		__bss_start = .;
+		*(.bss)
+		*(.sbss)
+		*(COMMON)
+		__bss_end = .;
+	}
+}
diff --git a/target/linux/realtek/image/rt-loader/src/board.c b/target/linux/realtek/image/rt-loader/src/board.c
new file mode 100644
index 00000000000..d6d5865673d
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/board.c
@@ -0,0 +1,110 @@
+/*
+ * rt-loader board functions
+ * (c) 2025 Markus Stockhausen
+ */
+
+#include "globals.h"
+#include "memory.h"
+#include "nanoprintf.h"
+
+#define DRAM_CONFIG_REG			0xb8001004
+#define UART_BUFFER_REG			0xb8002000
+#define UART_LINE_STATUS_REG		0xb8002014
+#define UART_TX_READY			(1 << 29)
+
+#define RTL838X_ENABLE_RW_MASK		0x3
+#define RTL838X_INT_RW_CTRL_REG		0xbb000058
+#define RTL838X_MODEL_NAME_INFO_REG	0xbb0000d4
+#define RTL839X_MODEL_NAME_INFO_REG	0xbb000ff0
+#define RTL83XX_CHIP_INFO_EN		0xa0000000
+#define RTL93XX_MODEL_NAME_INFO_REG	0xbb000004
+#define RTL93XX_CHIP_INFO_EN		0xa0000
+
+/*
+ * board_putchar() is the central function to write a character to the serial console. It waits
+ * for the UART to become ready and automatically sends a carriage return after a line feed.
+ * Printf libraries that need a fixed output function name (e.g. _putchar in
+ * https://github.com/mpaland/printf) can be linked to it with -D_putchar=board_putchar
+ */
+
+void board_putchar(int ch, void *ctx)
+{
+	do {
+		while (!(ioread32(UART_LINE_STATUS_REG) & UART_TX_READY));
+		iowrite32(((int)ch) << 24, UART_BUFFER_REG);
+		ch = (ch == '\n') ? '\r' : 0;	/* one more round for the carriage return */
+	} while (ch);
+}
+
+/*
+ * board_get_memory() derives the memory size in bytes from the DRAM config register. Its bank,
+ * bus width, row and column fields index lookup tables whose values are summed up as exponent.
+ */
+
+unsigned int board_get_memory(void)
+{
+	unsigned int dcr = ioread32(DRAM_CONFIG_REG);
+	char ROWCNTv[] = {11, 12, 13, 14, 15, 16};
+	char COLCNTv[] = {8, 9, 10, 11, 12};
+	char BNKCNTv[] = {1, 2, 3};
+	char BUSWIDv[] = {0, 1, 2};
+	/* 1U: a signed 1 << 31 (2GB) would overflow. NOTE: the table indexes are not range checked */
+	return 1U << (BNKCNTv[(dcr >> 28) & 0x3] + BUSWIDv[(dcr >> 24) & 0x3] +
+		      ROWCNTv[(dcr >> 20) & 0xf] + COLCNTv[(dcr >> 16) & 0xf]);
+}
+
+/*
+ * board_get_system() writes a readable system name (model and chip id, each with an optional
+ * version letter) into the given buffer of len bytes. It is printed during startup.
+ */
+
+void board_get_system(char *buffer, int len)
+{
+	unsigned int model_id, model_version, chip_id, chip_version;
+	unsigned int reg = RTL93XX_MODEL_NAME_INFO_REG;
+	unsigned int act = RTL93XX_CHIP_INFO_EN;
+	unsigned int val;
+
+	/* probe order: RTL93xx, then RTL839x, anything else is treated as RTL838x */
+	val = ioread32(reg);
+	if ((val & 0xffec0000) != 0x93000000) {
+		act = RTL83XX_CHIP_INFO_EN;
+		reg = RTL839X_MODEL_NAME_INFO_REG;
+		val = ioread32(reg);
+		if ((val & 0xfff80000) != 0x83900000) {
+			/* use the named constant instead of a bare 0x3 */
+			iowrite32(RTL838X_ENABLE_RW_MASK, RTL838X_INT_RW_CTRL_REG);
+			reg = RTL838X_MODEL_NAME_INFO_REG;
+			val = ioread32(reg);
+		}
+	}
+
+	model_id = val >> 16;
+	model_version = (val >> 11) & 0x1f;
+
+	/* write the enable pattern to the register behind the model name and read it back */
+	iowrite32(act, reg + 4);
+	val = ioread32(reg + 4);
+	chip_id = val & 0xffff;
+	if (model_id < 0x9300)
+		chip_version = (val >> 16) & 0x1f;
+	else
+		chip_version = (val >> 28) & 0x0f;
+
+	/* versions are shown as letters (1 = A, ...), a zero version is passed as NUL character */
+	snprintf(buffer, len, "RTL%04X%c (chip id %04x%c)",
+		 model_id, model_version ? model_version + 64 : 0,
+		 chip_id, chip_version ? chip_version + 64 : 0);
+}
+
+/*
+ * board_panic() is the last resort for unrecoverable errors. It announces the halt on the
+ * console and never returns. An automatic reboot might be added here some day.
+ */
+
+void board_panic(void)
+{
+	printf("halt system\n");
+	for (;;)
+		;
+}
diff --git a/target/linux/realtek/image/rt-loader/src/main.c b/target/linux/realtek/image/rt-loader/src/main.c
new file mode 100644
index 00000000000..747881ea067
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/main.c
@@ -0,0 +1,123 @@
+/*
+ * rt-loader main program
+ * (c) 2025 Markus Stockhausen
+ *
+ * This code was inspired by the OpenWrt lzma loader. Thanks to
+ *
+ * Copyright (C) 2004 Manuel Novoa III (mjn3@codepoet.org)
+ * Copyright (C) 2005 Mineharu Takahara <mtakahar@yahoo.com>
+ * Copyright (C) 2005 by Oleg I. Vdovikin <oleg@cs.msu.su>
+ * Copyright (C) 2011 Gabor Juhos <juhosg@openwrt.org>
+ */
+
+#include "board.h"
+#include "globals.h"
+#include "memory.h"
+
+#define NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS	1
+#define NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS		0
+#define NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS		0
+#define NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS		0
+#define NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS	0
+#define NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS	0
+#define NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS		0
+#define NANOPRINTF_IMPLEMENTATION
+#include "nanoprintf.h"
+
+extern void *_kernel_load_addr;
+extern void *_kernel_data_addr;
+extern int _kernel_data_size;
+extern void *_my_load_addr;
+extern int _my_load_size;
+
+extern int unlzma(unsigned char *buf, long in_len,
+	   long (*fill)(void*, unsigned long),
+	   long (*flush)(void*, unsigned long),
+	   unsigned char *output,
+	   long *outlen,
+	   long *posp,
+	   void(*error)(char *x));
+
+typedef void (*entry_func_t)(unsigned long reg_a0, unsigned long reg_a1,
+			     unsigned long reg_a2, unsigned long reg_a3);
+
+void *relocate(void *src, int len)
+{
+	unsigned int top, offs;
+	void *dest;
+
+	/*
+	 * Copy len bytes from src to the highest possible memory address and return that address.
+	 * This is the RAM size minus the room for the copy, heap, stack and 1kB of slack. The mask
+	 * rounds down to 64kB and, as there are no highmem features, limits the offset to 256MB.
+	 */
+	top = board_get_memory() - STACK_SIZE - HEAP_SIZE - len - 1024;
+	offs = top & 0xfff0000;
+	dest = (void *)KSEG0 + offs;
+
+	printf("Relocate %d bytes from 0x%08x to 0x%08x\n", len, src, dest);
+
+	memcpy(dest, src, len);
+	flush_cache(dest, len);
+
+	return dest;
+}
+
+void welcome(void)
+{
+	char system[80];	/* readable system name filled in by board_get_system() */
+
+	board_get_system(system, sizeof(system));
+	/* startup banner: loader name, detected system and memory size in megabytes */
+	printf("rt-loader\n");
+	printf("Running on %s with %dMB\n", system, board_get_memory() >> 20);
+}
+
+void decompress_error(char *x)	/* error callback handed to unlzma(): print the message x */
+{
+	printf("%s\n", x);
+}
+
+void *decompress(void *out, void *in, int len)
+{
+	long size;	/* size of the extracted kernel as reported by unlzma() */
+	int failed;
+
+	printf("Extract kernel with %d bytes from 0x%08x to 0x%08x ...\n", len, in, out);
+	failed = unlzma(in, len, 0, 0, out, &size, 0, decompress_error);
+	if (failed)
+		board_panic();
+
+	printf("Extracted kernel size is %d bytes\n", size);
+	flush_cache(out, size);
+	return out;
+}
+
+void main(unsigned long reg_a0, unsigned long reg_a1,
+	  unsigned long reg_a2, unsigned long reg_a3)
+{
+	entry_func_t next;
+
+	if (_kernel_load_addr != _my_load_addr) {
+		/*
+		 * Second run, now from the relocated copy: extract the attached kernel image
+		 * to the memory address that the loader was loaded to in the first run.
+		 */
+
+		next = decompress(_kernel_load_addr, _kernel_data_addr, _kernel_data_size);
+		printf("Booting kernel from 0x%08x ...\n\n", next);
+	} else {
+		/*
+		 * First run: relocate the whole package to the end of memory. The relocation
+		 * length _my_load_size includes the bss section, aka uninitialized globals.
+		 * So globals that are initialized during the first run are still at hand
+		 * after relocation.
+		 */
+
+		welcome();
+		next = relocate(_my_load_addr, _my_load_size);
+	}
+
+	/* the register arguments are handed over unchanged to whatever runs next */
+	next(reg_a0, reg_a1, reg_a2, reg_a3);
+}
diff --git a/target/linux/realtek/image/rt-loader/src/memory.c b/target/linux/realtek/image/rt-loader/src/memory.c
new file mode 100644
index 00000000000..6ff5a448974
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/memory.c
@@ -0,0 +1,122 @@
+/*
+ * rt-loader memory functions
+ * (c) 2025 Markus Stockhausen
+ *
+ * This is a small function collection to get some rudimentary memory management working when
+ * running bare metal. None of these functions is optimized but works well for current needs.
+ */
+
+#include "board.h"
+#include "globals.h"
+#include "memory.h"
+#include "nanoprintf.h"
+
+#define CACHE_OP(op, addr)			\
+	__asm__ __volatile__(			\
+	"	.set	push		\n"	\
+	"	.set	noreorder	\n"	\
+	"	.set	mips3\n\t	\n"	\
+	"	cache	%0, %1		\n"	\
+	"	.set	pop		\n"	\
+	:					\
+	: "i" (op), "R" (*(unsigned char *)(addr)))
+
+void flush_cache(void *start_addr, unsigned long size)
+{
+	/*
+	 * Flush the instruction and data cache lines that cover the given range. MIPS cores may
+	 * have cache lines of 16 or 32 bytes. Avoid detection routines and step with the lowest
+	 * known value that fits fine for longer cache lines too. An empty range is left untouched.
+	 */
+	unsigned long lsize = 16;
+	unsigned long addr = (unsigned long)start_addr & ~(lsize - 1);
+	unsigned long aend = ((unsigned long)start_addr + size - 1) & ~(lsize - 1);
+	if (!size)	/* aend would lie below addr and the loop would run through all of memory */
+		return;
+
+	for (;; addr += lsize) {
+		CACHE_OP(CACHE_HIT_INVALIDATE_I, addr);
+		CACHE_OP(CACHE_HIT_WRITEBACK_INV_D, addr);
+		if (addr == aend)
+			break;
+	}
+}
+
+void free(void *ptr)
+{
+	/* intentionally empty: malloc() only ever advances the heap, nothing is given back */
+}
+
+int memcmp(const void *s1, const void *s2, size_t count)
+{
+	volatile unsigned char *p1 = (volatile unsigned char *)s1;
+	volatile unsigned char *p2 = (volatile unsigned char *)s2;
+	/* Compare count bytes as unsigned char (C standard). Returns <0, 0 or >0 like libc memcmp() */
+	while (count--) {
+		if (*p1 != *p2)
+			return (int)(*p1) - (int)(*p2);
+
+		p1++;
+		p2++;
+	}
+
+	return 0;
+}
+
+void *memmove(void *dst, const void *src, size_t count)
+{
+	volatile char *to = (volatile char *)dst;
+	volatile char *from = (volatile char *)src;
+	size_t i;
+
+	/* overlap safe: copy upwards when moving down in memory and downwards otherwise */
+	if (to < from) {
+		for (i = 0; i < count; i++)
+			to[i] = from[i];
+	} else if (to > from) {
+		for (i = count; i > 0; i--)
+			to[i - 1] = from[i - 1];
+	}
+
+	return dst;
+}
+
+void *memcpy(void *dst, const void *src, size_t count)
+{
+	return memmove(dst, src, count);	/* hand back dst; the return statement was missing */
+}
+
+void *memset(void *dst, int c, size_t count)
+{
+	volatile char *d = (volatile char *)dst;
+
+	while (count--)
+		*d++ = c;
+
+	return dst;	/* start of the area as the C standard demands, not the advanced pointer */
+}
+
+void *malloc(size_t size)
+{
+	void *start;
+	/* One shot allocator: returns the next aligned heap address, panics if the heap is used up */
+	start = (void *)(((unsigned int)_heap_addr + MEMORY_ALIGNMENT - 1) & ~(MEMORY_ALIGNMENT - 1));
+	if ((start + size) > _heap_addr_max) {
+		printf("malloc(%d) failed. Only %dkB of %dkB heap left.\n",
+		       size, (_heap_addr_max - start) >> 10, HEAP_SIZE >> 10);
+		board_panic();
+	}
+	/* continue behind this block; adding size to the unaligned old value allowed overlaps */
+	_heap_addr = start + size;
+
+	return start;
+}
+
+size_t strlen(const char *s)
+{
+	size_t len = 0;	/* number of characters in front of the terminating NUL */
+
+	while (s[len] != '\0')
+		len++;
+	return len;
+}
diff --git a/target/linux/realtek/image/rt-loader/src/startup.S b/target/linux/realtek/image/rt-loader/src/startup.S
new file mode 100644
index 00000000000..898f7e1a163
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/startup.S
@@ -0,0 +1,182 @@
+# rt-loader assembler startup code
+# (c) 2025 Markus Stockhausen
+
+#include "globals.h"
+
+# This start code allows to run position independent code (PIC) on bare metal. In that case
+# all addresses are looked up via the global offset table (GOT). But that must be filled during
+# this initialization sequence. Without a proper GOT using the standard "la" instruction in the
+# code will not work. Provide a macro that avoids the dependency.
+# _LA loads the link time address of a symbol and adds the load address that is kept in t9.
+.macro _LA reg, symbol
+	lui \reg, %hi(\symbol)
+	addi \reg, \reg, %lo(\symbol)
+	add \reg, $t9
+.endm
+
+	.section .text
+	.globl _start
+	.ent _start
+_start:
+	.set noreorder
+
+# Determine current program load address and store it into t9.
+
+	bal	_where_am_i
+	nop
+_where_am_i:
+	move	$t9, $ra
+	subu	$t9, $t9, 0x8		# ra points 8 bytes behind _start (bal plus delay slot)
+
+
+# Check if this is our first run (_kernel_load_addr = 0?)
+
+	_LA	$t6, _kernel_load_addr
+	lw	$t7, 0($t6)
+	bne	$zero, $t7, _init_done
+	nop
+
+# During first run store the current load address as the target kernel load address.
+
+	sw	$t9, 0($t6)
+
+# Same for the global variables in the BSS section. Clear them only during the first run. This
+# way the "global program state" can be copied over to the relocation address.
+
+	_LA	$t3, __bss_start
+	_LA	$t4, __bss_end
+_bss_zero:
+	beq	$t3, $t4, _init_done	# ends on equality only, BSS size must be a multiple of 4
+	nop
+	sw	$zero, 0($t3)
+	addiu	$t3, $t3, 4
+	b	_bss_zero
+	nop
+
+_init_done:
+
+# Code is running bare metal and no one initializes the global offset table. After the build
+# process the table is relative to address 0x0. Starting from anywhere else breaks the program.
+# A manual update is required during startup. Usually this is quite easy by simply adding the
+# current load address to all entries.
+# But this code relocates itself to another memory address and starts itself over. At the new
+# address it will find a global offset table that fits to the previous execution. To solve this
+# store a copy of the last load address in got_delta variable and only add the difference after
+# a relocation. Sequence is as follows
+#
+# - U-Boot loads the code to 0x80100000
+# - U-Boot runs the code at 0x80100000
+# - code identifies its dynamic start_address = 0x80100000
+# - code reads (initial) _got_delta = 0x00000000
+# - code adds 0x80100000 to all GOT entries
+# - code stores _got_delta with 0x80100000
+# - code copies itself over to a new location 0x85000000
+# - code starts itself from 0x85000000
+# - code identifies its dynamic start_address = 0x85000000
+# - code reads (pre-filled) _got_delta = 0x80100000
+# - code adds 0x4f00000 (= 0x85000000 - 0x80100000) to all GOT entries
+# - ...
+#
+
+	_LA	$t6, _got_delta
+	lw	$t5, 0($t6)
+	subu    $t7, $t9, $t5
+	sw	$t9, 0($t6)
+	_LA	$t3, __got_start
+	_LA	$t4, __got_end
+_got_patch:
+	beq	$t3, $t4, _got_done	# ends on equality only, GOT size must be a multiple of 4
+	nop
+	lw	$t5, 0($t3)
+	addu	$t5, $t5, $t7
+	sw	$t5, 0($t3)
+	addiu	$t3, $t3, 4
+	b	_got_patch
+	nop
+_got_done:
+
+# Linker attached kernel to end of package. Store addresses in global variables
+
+	_LA	$t8, _my_load_addr
+	sw	$t9, 0($t8)
+
+	_LA	$t5, __kernel_data_start
+	_LA	$t4, _kernel_data_addr
+	sw	$t5, 0($t4)
+
+	_LA	$t3, __kernel_data_end
+	subu	$t3, $t3, $t5
+	_LA	$t4, _kernel_data_size
+	sw	$t3, 0($t4)
+
+# Determine own code size by looking where BSS ends.
+
+	_LA	$t3, __bss_end
+	subu	$t6, $t3, $t9
+	_LA	$t4, _my_load_size
+	sw	$t6, 0($t4)
+
+# Setup heap. It will start directly behind BSS (t3 still holds __bss_end)
+
+	addiu 	$t3, MEMORY_ALIGNMENT
+	li	$t4, ~(MEMORY_ALIGNMENT - 1)
+	and	$t3, $t4
+
+	_LA	$t5, _heap_addr
+	sw	$t3, 0($t5)
+
+	li	$t4, HEAP_SIZE
+	add	$t3, $t4
+
+	_LA	$t5, _heap_addr_max
+	sw	$t3, 0($t5)
+
+# Setup stack that is located on top of heap (sp = _heap_addr_max + STACK_SIZE).
+
+	li	$t4, STACK_SIZE
+	add	$sp, $t3, $t4
+
+# Adapt t9 so it points to main(). This is needed so main() can find the GOT via t9/gp
+
+	_LA     $t8, main
+	move	$t9, $t8
+
+# Call main(). The arguments of the caller are still untouched in a0 - a3.
+	bal	main
+	nop
+
+	.end _start
+
+	.section .data
+	.align 4
+# delta for global offset table initialization
+_got_delta:
+	.word 0
+# current heap address for malloc() / free()
+	.globl _heap_addr
+_heap_addr:
+	.word 0
+# maximum heap address
+	.globl _heap_addr_max
+_heap_addr_max:
+	.word 0
+# current program load address
+	.globl _my_load_addr
+_my_load_addr:
+	.word 0
+# total size of code including attached kernel and bss (uninitialized global variables)
+	.globl _my_load_size
+_my_load_size:
+	.word 0
+# target load address of kernel = address of this program during the initial run
+	.globl _kernel_load_addr
+_kernel_load_addr:
+	.word 0
+# absolute start address of attached kernel
+	.globl _kernel_data_addr
+_kernel_data_addr:
+	.word 0
+# size of attached kernel
+	.globl _kernel_data_size
+_kernel_data_size:
+	.word 0
diff --git a/target/linux/realtek/image/rt-loader/src/unlzma.c b/target/linux/realtek/image/rt-loader/src/unlzma.c
new file mode 100644
index 00000000000..a7ddc004ea9
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/unlzma.c
@@ -0,0 +1,663 @@
+/* Lzma decompressor for Linux kernel. Shamelessly snarfed
+ *from busybox 1.1.1
+ *
+ *Linux kernel adaptation
+ *Copyright (C) 2006  Alain < alain@knaff.lu >
+ *
+ *Based on small lzma deflate implementation/Small range coder
+ *implementation for lzma.
+ *Copyright (C) 2006  Aurelien Jacobs < aurel@gnuage.org >
+ *
+ *Based on LzmaDecode.c from the LZMA SDK 4.22 (https://www.7-zip.org/)
+ *Copyright (C) 1999-2005  Igor Pavlov
+ *
+ *Copyrights of the parts, see headers below.
+ *
+ *
+ *This program is free software; you can redistribute it and/or
+ *modify it under the terms of the GNU Lesser General Public
+ *License as published by the Free Software Foundation; either
+ *version 2.1 of the License, or (at your option) any later version.
+ *
+ *This program is distributed in the hope that it will be useful,
+ *but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *Lesser General Public License for more details.
+ *
+ *You should have received a copy of the GNU Lesser General Public
+ *License along with this library; if not, write to the Free Software
+ *Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ * Smaller of two values. Defined only when no MIN macro exists yet.
+ * Both arguments appear twice in the expansion, so avoid side effects.
+ */
+#ifndef MIN
+#define	MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
+/*
+ * Assemble an integer from `size` bytes stored least significant byte first.
+ *
+ * ptr:  first (least significant) byte of the value
+ * size: number of bytes to read; 0 or less yields 0
+ *
+ * The value is accumulated in an unsigned type. Shifting a signed accumulator
+ * left once its upper bits are set (e.g. an 8 byte field that is all 0xFF) is
+ * undefined behaviour in C; with unsigned arithmetic the shift is well defined
+ * and the result is only converted to the signed return type at the very end.
+ */
+static long long read_int(unsigned char *ptr, int size)
+{
+	unsigned long long value = 0;
+	int i;
+
+	/* Walk from the most significant byte (highest address) downwards. */
+	for (i = size - 1; i >= 0; i--)
+		value = (value << 8) | ptr[i];
+	return (long long)value;
+}
+
+/*
+ * Reinterpret the bytes of x as a little-endian integer (via read_int()) and
+ * store the result back into x. x must be an lvalue because its address is
+ * taken, and it appears several times in the expansion.
+ */
+#define ENDIAN_CONVERT(x) \
+  x = (typeof(x))read_int((unsigned char *)&x, sizeof(x))
+
+
+/* Small range coder implementation for lzma.
+ *Copyright (C) 2006  Aurelien Jacobs < aurel@gnuage.org >
+ *
+ *Based on LzmaDecode.c from the LZMA SDK 4.22 (https://www.7-zip.org/)
+ *Copyright (c) 1999-2005  Igor Pavlov
+ */
+
+#include "memory.h"
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * Number of bytes rc_read() requests from the fill callback, and the size of
+ * the input buffer unlzma() allocates when the caller passes none.
+ */
+#define LZMA_IOBUF_SIZE	0x10000
+
+/* State of the range decoder together with its input buffer. */
+struct rc {
+	/* Refill callback used by rc_read(); a result <= 0 is reported as EOF. */
+	long (*fill)(void*, unsigned long);
+	/* Next input byte to consume. */
+	uint8_t *ptr;
+	/* Start of the input buffer. */
+	uint8_t *buffer;
+	/* buffer + buffer_size; reaching it triggers rc_read(). */
+	uint8_t *buffer_end;
+	/* Number of bytes in buffer (initial length or last fill result). */
+	long buffer_size;
+	/* Current code value, built from the input bytes shifted in. */
+	uint32_t code;
+	/* Current range; renormalised when it drops below 1 << RC_TOP_BITS. */
+	uint32_t range;
+	/* Threshold computed by rc_is_bit_0_helper() for the bit being decoded. */
+	uint32_t bound;
+	/* Error reporting callback. */
+	void (*error)(char *);
+};
+
+
+/* rc_normalize() shifts in another input byte once range < (1 << RC_TOP_BITS). */
+#define RC_TOP_BITS 24
+/* Shift applied to the adjustment in rc_update_bit_0() / rc_update_bit_1(). */
+#define RC_MOVE_BITS 5
+/* Probabilities are scaled to (1 << RC_MODEL_TOTAL_BITS); they start at half of it. */
+#define RC_MODEL_TOTAL_BITS 11
+
+
+/*
+ * Fallback fill callback that rc_init() installs when the caller supplies
+ * none. It never delivers data and always returns -1.
+ */
+static long nofill(void *buffer, unsigned long len)
+{
+	(void)buffer;
+	(void)len;
+
+	return -1L;
+}
+
+/*
+ * Refill the input buffer through the fill callback and rewind the read
+ * pointer. A result of zero or less is reported through the error callback;
+ * the buffer bookkeeping is updated afterwards in either case.
+ */
+static void rc_read(struct rc *rc)
+{
+	long got = rc->fill((char *)rc->buffer, LZMA_IOBUF_SIZE);
+
+	rc->buffer_size = got;
+	if (got < 1)
+		rc->error("unexpected EOF");
+
+	rc->buffer_end = rc->buffer + rc->buffer_size;
+	rc->ptr = rc->buffer;
+}
+
+/*
+ * Prepare a range decoder for the given input buffer.
+ *
+ * fill:        refill callback, or NULL to use nofill()
+ * buffer:      start of the input data
+ * buffer_size: number of bytes currently available in buffer
+ */
+static inline void rc_init(struct rc *rc,
+			   long (*fill)(void*, unsigned long),
+			   char *buffer, long buffer_size)
+{
+	rc->fill = fill ? fill : nofill;
+
+	/* Input window: [buffer, buffer + buffer_size), read pointer at the start. */
+	rc->buffer = (uint8_t *)buffer;
+	rc->ptr = rc->buffer;
+	rc->buffer_size = buffer_size;
+	rc->buffer_end = rc->buffer + buffer_size;
+
+	/* Decoder state: full range, empty code. */
+	rc->range = 0xFFFFFFFF;
+	rc->code = 0;
+}
+
+/*
+ * Prime the decoder by shifting the next five input bytes into rc->code,
+ * refilling the input buffer whenever it runs dry.
+ */
+static inline void rc_init_code(struct rc *rc)
+{
+	int remaining = 5;
+
+	while (remaining-- > 0) {
+		if (rc->ptr >= rc->buffer_end)
+			rc_read(rc);
+		rc->code = (rc->code << 8) | *rc->ptr++;
+	}
+}
+
+
+/*
+ * Unconditional renormalisation step: consume one input byte (refilling the
+ * buffer first if necessary), append it to the code and widen the range by
+ * eight bits.
+ */
+static void rc_do_normalize(struct rc *rc)
+{
+	uint8_t next;
+
+	if (rc->ptr >= rc->buffer_end)
+		rc_read(rc);
+
+	next = *rc->ptr++;
+	rc->code = (rc->code << 8) | next;
+	rc->range <<= 8;
+}
+/* Renormalise only when the range has dropped below 1 << RC_TOP_BITS. */
+static inline void rc_normalize(struct rc *rc)
+{
+	if (rc->range >= (1 << RC_TOP_BITS))
+		return;
+	rc_do_normalize(rc);
+}
+
+/*
+ * Normalise the decoder, then compute and store the threshold (rc->bound)
+ * for probability *p. Kept separate from rc_is_bit_0() so that the
+ * comparison (rc->code < rc->bound) stays visible to the optimizer.
+ * Returns the new bound.
+ */
+static inline uint32_t rc_is_bit_0_helper(struct rc *rc, uint16_t *p)
+{
+	uint32_t scaled_range;
+
+	rc_normalize(rc);
+	scaled_range = rc->range >> RC_MODEL_TOTAL_BITS;
+	rc->bound = scaled_range * *p;
+	return rc->bound;
+}
+/*
+ * Returns non-zero when the next bit decodes as 0 for probability *p.
+ * The bound is fetched into a local first because the helper may change
+ * rc->code while normalising; the comparison must see the updated code.
+ */
+static inline int rc_is_bit_0(struct rc *rc, uint16_t *p)
+{
+	const uint32_t bound = rc_is_bit_0_helper(rc, p);
+
+	return bound > rc->code;
+}
+
+/*
+ * Commit a decoded 0 bit: the range shrinks to the bound and the
+ * probability *p moves towards (1 << RC_MODEL_TOTAL_BITS).
+ */
+static inline void rc_update_bit_0(struct rc *rc, uint16_t *p)
+{
+	const uint16_t prob = *p;
+
+	*p = prob + (((1 << RC_MODEL_TOTAL_BITS) - prob) >> RC_MOVE_BITS);
+	rc->range = rc->bound;
+}
+/*
+ * Commit a decoded 1 bit: range and code are both reduced by the bound and
+ * the probability *p moves towards zero.
+ */
+static inline void rc_update_bit_1(struct rc *rc, uint16_t *p)
+{
+	const uint32_t bound = rc->bound;
+	const uint16_t prob = *p;
+
+	rc->code -= bound;
+	rc->range -= bound;
+	*p = prob - (prob >> RC_MOVE_BITS);
+}
+
+/*
+ * Decode one bit with probability *p, update the probability and append the
+ * bit to *symbol (*symbol = *symbol * 2 + bit). Returns the decoded bit.
+ */
+static int rc_get_bit(struct rc *rc, uint16_t *p, int *symbol)
+{
+	const int bit = rc_is_bit_0(rc, p) ? 0 : 1;
+
+	if (bit)
+		rc_update_bit_1(rc, p);
+	else
+		rc_update_bit_0(rc, p);
+
+	*symbol = *symbol * 2 + bit;
+	return bit;
+}
+
+/*
+ * Decode one bit without a probability model: halve the range and test the
+ * code against it. Returns the decoded bit.
+ */
+static inline int rc_direct_bit(struct rc *rc)
+{
+	rc_normalize(rc);
+	rc->range >>= 1;
+
+	if (rc->code < rc->range)
+		return 0;
+
+	rc->code -= rc->range;
+	return 1;
+}
+
+/*
+ * Decode num_levels bits through the probability tree rooted at p.
+ * *symbol starts at 1 and doubles as the index of the next tree node; once
+ * all levels are decoded the leading 1 is removed, leaving the plain value.
+ */
+static inline void rc_bit_tree_decode(struct rc *rc, uint16_t *p, int num_levels, int *symbol)
+{
+	int remaining;
+
+	*symbol = 1;
+	for (remaining = num_levels; remaining != 0; remaining--)
+		rc_get_bit(rc, p + *symbol, symbol);
+	*symbol -= 1 << num_levels;
+}
+
+
+/*
+ * Small lzma deflate implementation.
+ * Copyright (C) 2006  Aurelien Jacobs < aurel@gnuage.org >
+ *
+ * Based on LzmaDecode.c from the LZMA SDK 4.22 (https://www.7-zip.org/)
+ * Copyright (C) 1999-2005  Igor Pavlov
+ */
+
+
+/*
+ * Stream header as unlzma() reads it: sizeof(header) bytes are copied from
+ * the input straight into this struct, hence the packed attribute.
+ */
+struct lzma_header {
+	/* Properties byte; unlzma() derives lc, lp and pb from it and rejects values >= 9 * 5 * 5. */
+	uint8_t pos;
+	/* Dictionary size; converted with ENDIAN_CONVERT() after reading, 0 is replaced by 1. */
+	uint32_t dict_size;
+	/* Size of the decompressed data; converted with ENDIAN_CONVERT() after reading. */
+	uint64_t dst_size;
+} __attribute__ ((packed)) ;
+
+
+/*
+ * Layout of the probability array p used by unlzma(). All values below are
+ * counts of, or offsets in, uint16_t entries.
+ *
+ * The array holds LZMA_BASE_SIZE fixed entries followed by
+ * LZMA_LIT_SIZE << (lc + lp) literal entries. LZMA_LITERAL, the offset of the
+ * literal entries, evaluates to the same 1846 as LZMA_BASE_SIZE.
+ */
+#define LZMA_BASE_SIZE 1846
+#define LZMA_LIT_SIZE 768
+
+/* Shift used to index per-state tables by pos_state. */
+#define LZMA_NUM_POS_BITS_MAX 4
+
+/* Number of bits decoded for the low / mid / high length ranges. */
+#define LZMA_LEN_NUM_LOW_BITS 3
+#define LZMA_LEN_NUM_MID_BITS 3
+#define LZMA_LEN_NUM_HIGH_BITS 8
+
+/* Offsets inside one length coder (relative to LZMA_LEN_CODER or LZMA_REP_LEN_CODER). */
+#define LZMA_LEN_CHOICE 0
+#define LZMA_LEN_CHOICE_2 (LZMA_LEN_CHOICE + 1)
+#define LZMA_LEN_LOW (LZMA_LEN_CHOICE_2 + 1)
+#define LZMA_LEN_MID (LZMA_LEN_LOW \
+		      + (1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_LOW_BITS)))
+#define LZMA_LEN_HIGH (LZMA_LEN_MID \
+		       +(1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_MID_BITS)))
+/* Total number of entries of one length coder. */
+#define LZMA_NUM_LEN_PROBS (LZMA_LEN_HIGH + (1 << LZMA_LEN_NUM_HIGH_BITS))
+
+/* Number of decoder states; states below LZMA_NUM_LIT_STATES skip the matched-literal path. */
+#define LZMA_NUM_STATES 12
+#define LZMA_NUM_LIT_STATES 7
+
+/* Position slot thresholds used by the distance decoding in process_bit1(). */
+#define LZMA_START_POS_MODEL_INDEX 4
+#define LZMA_END_POS_MODEL_INDEX 14
+#define LZMA_NUM_FULL_DISTANCES (1 << (LZMA_END_POS_MODEL_INDEX >> 1))
+
+#define LZMA_NUM_POS_SLOT_BITS 6
+#define LZMA_NUM_LEN_TO_POS_STATES 4
+
+#define LZMA_NUM_ALIGN_BITS 4
+
+/* Added to every decoded match length before copying. */
+#define LZMA_MATCH_MIN_LEN 2
+
+/* Offsets of the individual tables inside p. */
+#define LZMA_IS_MATCH 0
+#define LZMA_IS_REP (LZMA_IS_MATCH + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX))
+#define LZMA_IS_REP_G0 (LZMA_IS_REP + LZMA_NUM_STATES)
+#define LZMA_IS_REP_G1 (LZMA_IS_REP_G0 + LZMA_NUM_STATES)
+#define LZMA_IS_REP_G2 (LZMA_IS_REP_G1 + LZMA_NUM_STATES)
+#define LZMA_IS_REP_0_LONG (LZMA_IS_REP_G2 + LZMA_NUM_STATES)
+#define LZMA_POS_SLOT (LZMA_IS_REP_0_LONG \
+		       + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX))
+#define LZMA_SPEC_POS (LZMA_POS_SLOT \
+		       +(LZMA_NUM_LEN_TO_POS_STATES << LZMA_NUM_POS_SLOT_BITS))
+#define LZMA_ALIGN (LZMA_SPEC_POS \
+		    + LZMA_NUM_FULL_DISTANCES - LZMA_END_POS_MODEL_INDEX)
+#define LZMA_LEN_CODER (LZMA_ALIGN + (1 << LZMA_NUM_ALIGN_BITS))
+#define LZMA_REP_LEN_CODER (LZMA_LEN_CODER + LZMA_NUM_LEN_PROBS)
+#define LZMA_LITERAL (LZMA_REP_LEN_CODER + LZMA_NUM_LEN_PROBS)
+
+
+/* Output side of the decompressor. */
+struct writer {
+	/* Destination of decoded bytes: caller's output buffer or one allocated by unlzma(). */
+	uint8_t *buffer;
+	/* Last byte passed to write_byte(); selects the literal probability table. */
+	uint8_t previous_byte;
+	/* Index in buffer where the next byte is stored. */
+	size_t buffer_pos;
+	/* Size of the buffer allocated by unlzma(); only set when no output buffer was passed. */
+	int bufsize;
+	/* Number of bytes already handed to flush(); stays 0 without a flush callback. */
+	size_t global_pos;
+	/* Optional callback invoked by write_byte() each time dict_size bytes have accumulated. */
+	long (*flush)(void*, unsigned long);
+	/* Stream header (dict_size and dst_size are consulted while writing). */
+	struct lzma_header *header;
+};
+
+/* Decoder state carried from one decoded symbol to the next. */
+struct cstate {
+	/* Current state number; starts at 0 and is updated by process_bit0() / process_bit1(). */
+	int state;
+	/* The four most recently used match distances, rep0 being the newest; all start at 1. */
+	uint32_t rep0, rep1, rep2, rep3;
+};
+
+/* Total number of bytes produced so far: flushed bytes plus bytes still buffered. */
+static inline size_t get_pos(struct writer *wr)
+{
+	return wr->buffer_pos + wr->global_pos;
+}
+
+/*
+ * Fetch the byte that was written offs positions before the current write
+ * position.
+ *
+ * With a flush callback the index is computed in unsigned arithmetic and
+ * dict_size is added while it is >= dict_size, which relies on unsigned
+ * wrap-around to bring an index from before the buffer start back in range.
+ * Without one, offs is first reduced until it no longer exceeds dict_size
+ * and the index is simply buffer_pos - offs.
+ */
+static inline uint8_t peek_old_byte(struct writer *wr, uint32_t offs)
+{
+	int32_t linear_pos;
+
+	if (wr->flush) {
+		uint32_t wrapped_pos = wr->buffer_pos - offs;
+
+		while (wrapped_pos >= wr->header->dict_size)
+			wrapped_pos += wr->header->dict_size;
+		return wr->buffer[wrapped_pos];
+	}
+
+	while (offs > wr->header->dict_size)
+		offs -= wr->header->dict_size;
+	linear_pos = wr->buffer_pos - offs;
+	return wr->buffer[linear_pos];
+}
+
+/*
+ * Append one byte to the output and remember it as previous_byte.
+ *
+ * When a flush callback is set and dict_size bytes have accumulated, the
+ * write position is reset, global_pos advances by dict_size and the buffer
+ * is handed to flush().
+ *
+ * Returns 0 on success, -1 when flush() does not report dict_size bytes.
+ */
+static inline int write_byte(struct writer *wr, uint8_t byte)
+{
+	wr->previous_byte = byte;
+	wr->buffer[wr->buffer_pos++] = byte;
+
+	/* Nothing else to do without a flush callback or before the buffer is full. */
+	if (!wr->flush || wr->buffer_pos != wr->header->dict_size)
+		return 0;
+
+	wr->buffer_pos = 0;
+	wr->global_pos += wr->header->dict_size;
+	if (wr->flush((char *)wr->buffer, wr->header->dict_size)
+			== wr->header->dict_size)
+		return 0;
+	return -1;
+}
+
+
+/*
+ * Re-emit the byte found offs positions back in the output.
+ * Returns the result of write_byte().
+ */
+static inline int copy_byte(struct writer *wr, uint32_t offs)
+{
+	const uint8_t old_byte = peek_old_byte(wr, offs);
+
+	return write_byte(wr, old_byte);
+}
+
+/*
+ * Copy up to len bytes from distance rep0 back in the output. At least one
+ * byte is always copied; copying stops early once buffer_pos reaches
+ * dst_size.
+ *
+ * Returns -1 when a write fails, otherwise the number of bytes that were
+ * not copied (0 when the whole run fitted).
+ */
+static inline int copy_bytes(struct writer *wr,
+					 uint32_t rep0, int len)
+{
+	for (;;) {
+		if (copy_byte(wr, rep0))
+			return -1;
+		if (--len == 0)
+			break;
+		if (wr->buffer_pos >= wr->header->dst_size)
+			break;
+	}
+
+	return len;
+}
+
+/*
+ * Handle an IS_MATCH bit that decoded as 0: decode one literal byte and
+ * write it to the output.
+ *
+ * wr:               output writer
+ * rc:               range decoder
+ * cst:              decoder state; cst->state is updated, cst->rep0 is read
+ * p:                base of the probability array
+ * pos_state:        not used by this function
+ * prob:             the IS_MATCH probability that was just tested
+ * lc:               number of high bits of the previous byte used as context
+ * literal_pos_mask: mask applied to the output position for the context
+ *
+ * Returns the result of write_byte() (0 on success, -1 on flush failure).
+ */
+static inline int process_bit0(struct writer *wr, struct rc *rc,
+				     struct cstate *cst, uint16_t *p,
+				     int pos_state, uint16_t *prob,
+				     int lc, uint32_t literal_pos_mask) {
+	/* mi accumulates the literal behind a leading 1 bit; done once it reaches 0x100. */
+	int mi = 1;
+	rc_update_bit_0(rc, prob);
+	/* Select the literal table from the masked output position and the previous byte. */
+	prob = (p + LZMA_LITERAL +
+		(LZMA_LIT_SIZE
+		 * (((get_pos(wr) & literal_pos_mask) << lc)
+		    + (wr->previous_byte >> (8 - lc))))
+		);
+
+	if (cst->state >= LZMA_NUM_LIT_STATES) {
+		/*
+		 * The byte at distance rep0 steers the table selection bit by
+		 * bit until the first decoded bit that differs from it.
+		 */
+		int match_byte = peek_old_byte(wr, cst->rep0);
+		do {
+			int bit;
+			uint16_t *prob_lit;
+
+			match_byte <<= 1;
+			bit = match_byte & 0x100;
+			prob_lit = prob + 0x100 + bit + mi;
+			if (rc_get_bit(rc, prob_lit, &mi)) {
+				if (!bit)
+					break;
+			} else {
+				if (bit)
+					break;
+			}
+		} while (mi < 0x100);
+	}
+	/* Decode the remaining bits (all eight when the block above was skipped). */
+	while (mi < 0x100) {
+		uint16_t *prob_lit = prob + mi;
+		rc_get_bit(rc, prob_lit, &mi);
+	}
+	/* State transition after a literal. */
+	if (cst->state < 4)
+		cst->state = 0;
+	else if (cst->state < 10)
+		cst->state -= 3;
+	else
+		cst->state -= 6;
+
+	/* write_byte() takes a uint8_t, which drops the leading marker bit of mi. */
+	return write_byte(wr, mi);
+}
+
+/*
+ * Handle an IS_MATCH bit that decoded as 1: decode a match (new distance or
+ * one of the remembered distances rep0..rep3), then copy the matched bytes.
+ *
+ * wr:        output writer
+ * rc:        range decoder
+ * cst:       decoder state; state and rep0..rep3 are updated
+ * p:         base of the probability array
+ * pos_state: output position masked with pos_state_mask by the caller
+ * prob:      the IS_MATCH probability that was just tested
+ *
+ * Returns 0 on success and also when the decoded distance wraps to 0 (the
+ * caller then sees cst->rep0 == 0 and stops decoding). Returns -1 for a
+ * distance larger than dict_size or than the bytes produced so far, or on a
+ * write failure. A positive value is the remainder reported by copy_bytes();
+ * the caller treats any non-zero result as corrupt data.
+ */
+static inline int process_bit1(struct writer *wr, struct rc *rc,
+			       struct cstate *cst, uint16_t *p,
+			       int pos_state, uint16_t *prob) {
+	int offset;
+	uint16_t *prob_len;
+	int num_bits;
+	int len;
+
+	rc_update_bit_1(rc, prob);
+	prob = p + LZMA_IS_REP + cst->state;
+	if (rc_is_bit_0(rc, prob)) {
+		/* New distance follows: shift the distance history, use LZMA_LEN_CODER. */
+		rc_update_bit_0(rc, prob);
+		cst->rep3 = cst->rep2;
+		cst->rep2 = cst->rep1;
+		cst->rep1 = cst->rep0;
+		cst->state = cst->state < LZMA_NUM_LIT_STATES ? 0 : 3;
+		prob = p + LZMA_LEN_CODER;
+	} else {
+		/* Reuse one of the remembered distances. */
+		rc_update_bit_1(rc, prob);
+		prob = p + LZMA_IS_REP_G0 + cst->state;
+		if (rc_is_bit_0(rc, prob)) {
+			rc_update_bit_0(rc, prob);
+			prob = (p + LZMA_IS_REP_0_LONG
+				+ (cst->state <<
+				   LZMA_NUM_POS_BITS_MAX) +
+				pos_state);
+			if (rc_is_bit_0(rc, prob)) {
+				/* Single byte copied from distance rep0; no length is decoded. */
+				rc_update_bit_0(rc, prob);
+
+				cst->state = cst->state < LZMA_NUM_LIT_STATES ?
+					9 : 11;
+				return copy_byte(wr, cst->rep0);
+			} else {
+				rc_update_bit_1(rc, prob);
+			}
+		} else {
+			uint32_t distance;
+
+			/* Pick rep1, rep2 or rep3 and move it to the front of the history. */
+			rc_update_bit_1(rc, prob);
+			prob = p + LZMA_IS_REP_G1 + cst->state;
+			if (rc_is_bit_0(rc, prob)) {
+				rc_update_bit_0(rc, prob);
+				distance = cst->rep1;
+			} else {
+				rc_update_bit_1(rc, prob);
+				prob = p + LZMA_IS_REP_G2 + cst->state;
+				if (rc_is_bit_0(rc, prob)) {
+					rc_update_bit_0(rc, prob);
+					distance = cst->rep2;
+				} else {
+					rc_update_bit_1(rc, prob);
+					distance = cst->rep3;
+					cst->rep3 = cst->rep2;
+				}
+				cst->rep2 = cst->rep1;
+			}
+			cst->rep1 = cst->rep0;
+			cst->rep0 = distance;
+		}
+		cst->state = cst->state < LZMA_NUM_LIT_STATES ? 8 : 11;
+		prob = p + LZMA_REP_LEN_CODER;
+	}
+
+	/* Decode the length: choose the low, mid or high range, then its bit tree. */
+	prob_len = prob + LZMA_LEN_CHOICE;
+	if (rc_is_bit_0(rc, prob_len)) {
+		rc_update_bit_0(rc, prob_len);
+		prob_len = (prob + LZMA_LEN_LOW
+			    + (pos_state <<
+			       LZMA_LEN_NUM_LOW_BITS));
+		offset = 0;
+		num_bits = LZMA_LEN_NUM_LOW_BITS;
+	} else {
+		rc_update_bit_1(rc, prob_len);
+		prob_len = prob + LZMA_LEN_CHOICE_2;
+		if (rc_is_bit_0(rc, prob_len)) {
+			rc_update_bit_0(rc, prob_len);
+			prob_len = (prob + LZMA_LEN_MID
+				    + (pos_state <<
+				       LZMA_LEN_NUM_MID_BITS));
+			offset = 1 << LZMA_LEN_NUM_LOW_BITS;
+			num_bits = LZMA_LEN_NUM_MID_BITS;
+		} else {
+			rc_update_bit_1(rc, prob_len);
+			prob_len = prob + LZMA_LEN_HIGH;
+			offset = ((1 << LZMA_LEN_NUM_LOW_BITS)
+				  + (1 << LZMA_LEN_NUM_MID_BITS));
+			num_bits = LZMA_LEN_NUM_HIGH_BITS;
+		}
+	}
+
+	rc_bit_tree_decode(rc, prob_len, num_bits, &len);
+	len += offset;
+
+	/*
+	 * Only the new-distance path above leaves state below 4 (0 or 3); the
+	 * reuse paths set 8 or 11. So a distance is decoded only for new matches.
+	 */
+	if (cst->state < 4) {
+		int pos_slot;
+
+		cst->state += LZMA_NUM_LIT_STATES;
+		prob =
+			p + LZMA_POS_SLOT +
+			((len <
+			  LZMA_NUM_LEN_TO_POS_STATES ? len :
+			  LZMA_NUM_LEN_TO_POS_STATES - 1)
+			 << LZMA_NUM_POS_SLOT_BITS);
+		rc_bit_tree_decode(rc, prob,
+				   LZMA_NUM_POS_SLOT_BITS,
+				   &pos_slot);
+		if (pos_slot >= LZMA_START_POS_MODEL_INDEX) {
+			int i, mi;
+			num_bits = (pos_slot >> 1) - 1;
+			cst->rep0 = 2 | (pos_slot & 1);
+			if (pos_slot < LZMA_END_POS_MODEL_INDEX) {
+				cst->rep0 <<= num_bits;
+				prob = p + LZMA_SPEC_POS +
+					cst->rep0 - pos_slot - 1;
+			} else {
+				/* Upper bits come as direct bits, the last LZMA_NUM_ALIGN_BITS from LZMA_ALIGN. */
+				num_bits -= LZMA_NUM_ALIGN_BITS;
+				while (num_bits--)
+					cst->rep0 = (cst->rep0 << 1) |
+						rc_direct_bit(rc);
+				prob = p + LZMA_ALIGN;
+				cst->rep0 <<= LZMA_NUM_ALIGN_BITS;
+				num_bits = LZMA_NUM_ALIGN_BITS;
+			}
+			/* Remaining bits are decoded least significant bit first. */
+			i = 1;
+			mi = 1;
+			while (num_bits--) {
+				if (rc_get_bit(rc, prob + mi, &mi))
+					cst->rep0 |= i;
+				i <<= 1;
+			}
+		} else
+			cst->rep0 = pos_slot;
+		/* A decoded value of 0xFFFFFFFF wraps to 0 here; unlzma() stops on rep0 == 0. */
+		if (++(cst->rep0) == 0)
+			return 0;
+		/* Reject distances beyond the dictionary or beyond the data produced so far. */
+		if (cst->rep0 > wr->header->dict_size
+				|| cst->rep0 > get_pos(wr))
+			return -1;
+	}
+
+	len += LZMA_MATCH_MIN_LEN;
+
+	return copy_bytes(wr, cst->rep0, len);
+}
+
+
+
+/*
+ * Decompress one LZMA stream (13 byte header followed by range coded data).
+ *
+ * buf:    input data, or NULL to have an LZMA_IOBUF_SIZE input buffer
+ *         allocated here (input then has to come from fill)
+ * in_len: number of valid bytes initially available in buf
+ * fill:   optional callback that refills the input buffer; NULL means the
+ *         input is limited to in_len bytes
+ * flush:  optional callback that receives the output in chunks of dict_size
+ *         bytes plus a final partial chunk
+ * output: destination buffer, or NULL to allocate
+ *         MIN(dst_size, dict_size) bytes here
+ * outlen: if not NULL, receives the number of bytes decompressed
+ * posp:   if not NULL, receives the read offset inside the current input
+ *         buffer after decoding
+ * error:  error reporting callback; it is called unconditionally and must
+ *         not be NULL
+ *
+ * Returns 0 on success and -1 on any failure (bad header, allocation
+ * failure, corrupt data, input exhausted or a failing flush).
+ */
+int unlzma(unsigned char *buf, long in_len,
+	   long (*fill)(void*, unsigned long),
+	   long (*flush)(void*, unsigned long),
+	   unsigned char *output,
+	   long *outlen,
+	   long *posp,
+	   void(*error)(char *x))
+{
+	struct lzma_header header;
+	int lc, pb, lp;
+	uint32_t pos_state_mask;
+	uint32_t literal_pos_mask;
+	uint16_t *p;
+	int num_probs;
+	struct rc rc;
+	int i, mi;
+	struct writer wr;
+	struct cstate cst;
+	unsigned char *inbuf;
+	int ret = -1;
+
+	rc.error = error;
+
+	if (buf)
+		inbuf = buf;
+	else
+		inbuf = malloc(LZMA_IOBUF_SIZE);
+	if (!inbuf) {
+		error("Could not allocate input buffer");
+		goto exit_0;
+	}
+
+	cst.state = 0;
+	cst.rep0 = cst.rep1 = cst.rep2 = cst.rep3 = 1;
+
+	wr.header = &header;
+	wr.flush = flush;
+	wr.global_pos = 0;
+	wr.previous_byte = 0;
+	wr.buffer_pos = 0;
+
+	rc_init(&rc, fill, inbuf, in_len);
+
+	/* Copy the raw header bytes; the multi-byte fields are converted below. */
+	for (i = 0; i < sizeof(header); i++) {
+		if (rc.ptr >= rc.buffer_end)
+			rc_read(&rc);
+		((unsigned char *)&header)[i] = *rc.ptr++;
+	}
+
+	if (header.pos >= (9 * 5 * 5)) {
+		error("bad header");
+		goto exit_1;
+	}
+
+	/* Split the properties byte: pos = (pb * 5 + lp) * 9 + lc. */
+	mi = 0;
+	lc = header.pos;
+	while (lc >= 9) {
+		mi++;
+		lc -= 9;
+	}
+	pb = 0;
+	lp = mi;
+	while (lp >= 5) {
+		pb++;
+		lp -= 5;
+	}
+	pos_state_mask = (1 << pb) - 1;
+	literal_pos_mask = (1 << lp) - 1;
+
+	ENDIAN_CONVERT(header.dict_size);
+	ENDIAN_CONVERT(header.dst_size);
+
+	if (header.dict_size == 0)
+		header.dict_size = 1;
+
+	if (output)
+		wr.buffer = output;
+	else {
+		wr.bufsize = MIN(header.dst_size, header.dict_size);
+		wr.buffer = malloc(wr.bufsize);
+	}
+	if (wr.buffer == NULL)
+		goto exit_1;
+
+	num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp));
+	p = (uint16_t *) malloc(num_probs * sizeof(*p));
+	if (p == NULL)
+		goto exit_2;
+	/* Every probability starts at one half. */
+	num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp));
+	for (i = 0; i < num_probs; i++)
+		p[i] = (1 << RC_MODEL_TOTAL_BITS) >> 1;
+
+	rc_init_code(&rc);
+
+	while (get_pos(&wr) < header.dst_size) {
+		int pos_state =	get_pos(&wr) & pos_state_mask;
+		uint16_t *prob = p + LZMA_IS_MATCH +
+			(cst.state << LZMA_NUM_POS_BITS_MAX) + pos_state;
+		if (rc_is_bit_0(&rc, prob)) {
+			if (process_bit0(&wr, &rc, &cst, p, pos_state, prob,
+					lc, literal_pos_mask)) {
+				error("LZMA data is corrupt");
+				goto exit_3;
+			}
+		} else {
+			if (process_bit1(&wr, &rc, &cst, p, pos_state, prob)) {
+				error("LZMA data is corrupt");
+				goto exit_3;
+			}
+			/* process_bit1() leaves rep0 == 0 when the distance wrapped: stop decoding. */
+			if (cst.rep0 == 0)
+				break;
+		}
+		/* A failed refill leaves buffer_size <= 0: give up. */
+		if (rc.buffer_size <= 0)
+			goto exit_3;
+	}
+
+	/* outlen is optional like posp; never write through a NULL pointer. */
+	if (outlen)
+		*outlen = get_pos(&wr);
+
+	if (posp)
+		*posp = rc.ptr-rc.buffer;
+	/* Hand the final partial chunk to flush(), if there is a flush callback. */
+	if (!wr.flush || wr.flush(wr.buffer, wr.buffer_pos) == wr.buffer_pos)
+		ret = 0;
+exit_3:
+	free(p);
+exit_2:
+	if (!output)
+		free(wr.buffer);
+exit_1:
+	if (!buf)
+		free(inbuf);
+exit_0:
+	return ret;
+}