Implement CPM support

2024-01-10 16:23:09 -05:00 · 2024-01-10 16:13:30 -05:00 · 2024-01-10 16:01:47 -05:00 · 2024-01-10 15:50:34 -05:00 · 2024-01-10 15:43:15 -05:00 · 2024-01-10 15:25:22 -05:00
15 changed files with 50943 additions and 572 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,87 +1,99 @@
-project(fpng_test)
-
-cmake_minimum_required(VERSION 3.0)
-option(BUILD_X64 "build 64-bit" TRUE)
-option(SSE "SSE 4.1 support" FALSE)
-
-message("Initial BUILD_X64=${BUILD_X64}")
-message("Initial SSE=${SSE}")
-
-if( NOT CMAKE_BUILD_TYPE )
-  set( CMAKE_BUILD_TYPE Release )
-endif()
-
-message( ${PROJECT_NAME} " build type: " ${CMAKE_BUILD_TYPE} )
-
-if (BUILD_X64)
-	message("Building 64-bit")
-else()
-	message("Building 32-bit")
-endif()
-
-if (SSE)
-	message("SSE enabled")
-else()
-	message("SSE disabled")
-endif()
-
-if (NOT MSVC)
-   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
-   set(CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
-
-   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-   set(CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-
-   set(CMAKE_CXX_FLAGS -std=c++11)
-   set(GCC_COMPILE_FLAGS "-fvisibility=hidden -fPIC -fno-strict-aliasing -D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64 -Wall -Wextra -Isrc")
-
-   if (NOT BUILD_X64)
-	  set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -m32")
-   endif()
-
-   set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS}")
-   set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS}")
-
-   set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${GCC_LINK_FLAGS} -Wl,-rpath .")
-
-   set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}")
-   set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} ${GCC_COMPILE_FLAGS}")
-   set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${GCC_COMPILE_FLAGS} -D_DEBUG")
-
-   set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${GCC_COMPILE_FLAGS}")
-   set(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} ${GCC_COMPILE_FLAGS}")
-   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${GCC_COMPILE_FLAGS} -D_DEBUG")
-   
-   if (SSE)
-		set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DFPNG_NO_SSE=0 -msse4.1 -mpclmul")
-		set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DFPNG_NO_SSE=0 -msse4.1 -mpclmul")
-   else()
-	  	set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DFPNG_NO_SSE=1")
-		set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DFPNG_NO_SSE=1")
-   endif()
-else()
-	set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS}")
-	set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS}")
-endif()
-
-set(FPNG_SRC_LIST ${COMMON_SRC_LIST} 
-	src/fpng.cpp
-	src/fpng_test.cpp
-	src/lodepng.cpp
-	)
-
-if (APPLE)
-   set(BIN_DIRECTORY "bin_osx")
-else()
-   set(BIN_DIRECTORY "bin")
-endif()
-
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${BIN_DIRECTORY})
-
-add_executable(fpng_test ${FPNG_SRC_LIST})
-
-if (NOT MSVC)
-   target_link_libraries(fpng_test m pthread)
-endif()
-
-install(TARGETS fpng_test DESTINATION bin)
+# cmake_minimum_required() must precede project() so policies are set before the toolchain is probed.
+cmake_minimum_required(VERSION 3.0)
+project(fpng)
+
+# User-facing switches; both are read by the compiler-flag setup further down.
+option(BUILD_X64 "build 64-bit" TRUE)
+option(SSE "SSE 4.1 support" FALSE)
+message("Initial BUILD_X64=${BUILD_X64}")
+message("Initial SSE=${SSE}")
+
+# Fall back to a Release build when no build type was requested.
+if( NOT CMAKE_BUILD_TYPE )
+  set( CMAKE_BUILD_TYPE Release )
+endif()
+message( ${PROJECT_NAME} " build type: " ${CMAKE_BUILD_TYPE} )
+
+if (BUILD_X64)
+   message("Building 64-bit")
+else()
+   message("Building 32-bit")
+endif()
+if (SSE)
+   message("SSE enabled")
+else()
+   message("SSE disabled")
+endif()
+
+if (NOT MSVC)
+   # Extend each language's own per-config flags. The C variants were previously seeded
+   # from the CXX variables, which threw away any user-supplied C flags.
+   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
+   set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g")
+   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
+   set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3")
+
+   # Append the language standard rather than overwriting CMAKE_CXX_FLAGS, so flags
+   # passed on the command line or set by a parent project are preserved.
+   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+   # No -fvisibility=hidden: fpng is built as a SHARED library and its API does not appear to
+   # carry export annotations, so hiding symbols by default would leave nothing to link against.
+   set(GCC_COMPILE_FLAGS "-fPIC -fno-strict-aliasing -D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64 -Wall -Wextra -Isrc")
+   if (NOT BUILD_X64)
+      set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -m32")
+   endif()
+
+   set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${GCC_LINK_FLAGS} -Wl,-rpath .")
+
+   set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}")
+   set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} ${GCC_COMPILE_FLAGS}")
+   set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${GCC_COMPILE_FLAGS} -D_DEBUG")
+
+   set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${GCC_COMPILE_FLAGS}")
+   set(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} ${GCC_COMPILE_FLAGS}")
+   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${GCC_COMPILE_FLAGS} -D_DEBUG")
+
+   # FPNG_NO_SSE is tested by the sources to decide whether the SSE 4.1 code is compiled in.
+   if (SSE)
+      set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DFPNG_NO_SSE=0 -msse4.1 -mpclmul")
+      set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DFPNG_NO_SSE=0 -msse4.1 -mpclmul")
+   else()
+      set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DFPNG_NO_SSE=1")
+      set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DFPNG_NO_SSE=1")
+   endif()
+endif()
+
+# Sources compiled into the fpng library; fpng_test.cpp is built separately below.
+set(FPNG_SRC_LIST ${COMMON_SRC_LIST}
+   src/fpng.cpp
+   src/lodepng.cpp
+   src/pvpngreader.cpp
+   )
+
+file(GLOB_RECURSE fpng_HEADERS "src/*.h")
+add_library(fpng SHARED ${FPNG_SRC_LIST})
+# Attach the header directory to the target itself so that projects pulling fpng in
+# (e.g. through CPM/add_subdirectory) inherit it simply by linking against fpng.
+target_include_directories(fpng PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/src")
+# WINDOWS_EXPORT_ALL_SYMBOLS: the fpng API does not appear to carry dllexport annotations,
+# and without exports an MSVC build of the DLL yields no import library to link against.
+set_target_properties(fpng PROPERTIES LINKER_LANGUAGE CXX WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+install(TARGETS ${PROJECT_NAME} DESTINATION lib/${PROJECT_NAME})
+install(FILES ${fpng_HEADERS} DESTINATION include/${PROJECT_NAME})
+
+if (APPLE)
+   set(BIN_DIRECTORY "bin_osx")
+else()
+   set(BIN_DIRECTORY "bin")
+endif()
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${BIN_DIRECTORY})
+
+add_executable(fpng_test "src/fpng_test.cpp")
+# fpng has to be linked on every platform; only m and pthread are specific to non-MSVC toolchains.
+target_link_libraries(fpng_test PRIVATE fpng)
+if (NOT MSVC)
+   target_link_libraries(fpng_test PRIVATE m pthread)
+endif()
+install(TARGETS fpng_test DESTINATION bin)
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # fpng
-fpng is a very fast C++ .PNG image reader/writer for 24/32bpp images. fpng.cpp was written to see just how fast you can write .PNG's without sacrificing too much compression. The files written by fpng conform to the [PNG standard](https://www.w3.org/TR/PNG/), are readable using any PNG decoder, and validate successfully using [pngcheck](https://www.w3.org/TR/PNG/). PNG files written using fpng can also be read using fpng significantly faster than other PNG libraries, due to its explicit use of [Length-Limited Prefix Codes](https://create.stephan-brumme.com/length-limited-prefix-codes/) and an optimized decoder that exploits the properties of these codes.
+fpng is a very fast C++ .PNG image reader/writer for 24/32bpp images. It's a [single source file](src/fpng.h) with no dependencies on any other library. fpng.cpp was written to see just how fast you can write .PNG's without sacrificing too much compression. The files written by fpng conform to the [PNG standard](https://www.w3.org/TR/PNG/), are readable using any PNG decoder, and load or validate successfully using libpng, wuffs, lodepng, stb_image, and [pngcheck](http://www.libpng.org/pub/png/apps/pngcheck.html). PNG files written using fpng can also be read using fpng faster than other PNG libraries, due to its explicit use of [Length-Limited Prefix Codes](https://create.stephan-brumme.com/length-limited-prefix-codes/) and an [optimized decoder](https://fastcompression.blogspot.com/2015/10/huffman-revisited-part-4-multi-bytes.html) that exploits the properties of these codes.

 fpng.cpp compression compared to stb_image_write.h: 12-19x faster with roughly 5-11% avg. smaller files. 

@@ -7,6 +7,8 @@ fpng.cpp decompression compared to stb_image.h: 2.5-3x faster (on fpng compresse

 fpng.cpp compared to libpng: ~23x faster compression, 2.5-3x faster decompression (on fpng compressed PNG's)

+fpng.cpp compared to Wuffs decompression: roughly 10% faster decompression (on fpng compressed PNG's - note Wuffs decompression is in general *extremely* fast)
+
 Here's an example image encoded by fpng (a downsampled version of "bridge" from [here](http://imagecompression.info/test_images/)):
 ![fpng encoded "bridge" image](https://github.com/richgel999/fpng/blob/main/example.png)

@@ -58,7 +60,7 @@ To build, compile from the included .SLN with Visual Studio 2019/2022 or use cma

 Remove "-DSSE=1" on non-x86/x64 systems. The test executable will be in the "bin" or "bin_osx" subdirectory.

-Tested with MSVC 2022/2019/gcc 7.5.0/clang 6.0. I have only tested fpng.cpp on little endian systems. The code is there for big endian, and it should work, but it needs testing.
+Tested with MSVC 2022/2019/gcc 7.5.0/clang 6.0 and 10.0. I have only tested fpng.cpp on little endian systems. The code is there for big endian, and it should work, but it needs testing.

 ## Testing

@@ -129,23 +131,37 @@ For convenience some of the lib's internal functionality is exposed through thes
 namespace fpng {
  bool fpng_cpu_supports_sse41();
  uint32_t fpng_crc32(const void* pData, size_t size, uint32_t prev_crc32 = FPNG_CRC32_INIT);
-  uint32_t fpng_adler32(const uint8_t* ptr, size_t buf_len, uint32_t adler = FPNG_ADLER32_INIT);
+  uint32_t fpng_adler32(const void* pData, size_t size, uint32_t adler = FPNG_ADLER32_INIT);
 }
 ```

+## Python Bindings
+
+Python bindings are available [here](https://github.com/qrmt/fpng-python). Thanks, [Oskar](https://github.com/qrmt)!
+
 ## Notes

-This version of FPNG always uses PNG filter #2 and is limited to only RLE matches (i.e. LZ matches with a match distance of either 3 or 4). It's around 5% weaker than the original release, which used LZRW1 parsing. (I'll eventually add back in the original parser as an option, but doing that will add more code/complexity to the project.)
+- 4/20/2023: I upgraded lodepng, stb_image, and qoi to the latest versions. I also added pvpngreader.cpp/.h for benchmarking, which uses miniz internally for decompression. The relative encoding/decoding performance of QOI vs. PNG in general seems quite dependent on the C/C++ compiler you use. 
+
+pvpngreader.cpp relies on miniz.h for zlib decompression. It's been fuzzed using zzuf and is used in the [Basis Universal repo](https://github.com/binomialLLC/basis_universal) for PNG reading.
+
+lodepng v20230410 fetched 4/20/2023
+
+stb_image.h v2.28 fetched 4/20/2023
+
+stb_image_write.h v1.16 fetched 12/18/2021 (still latest as of 4/20/2023)
+
+qoi.h fetched 4/20/2023
+
+- This version of FPNG always uses PNG filter #2 and is limited to only RLE matches (i.e. LZ matches with a match distance of either 3 or 4). It's around 5% weaker than the original release, which used LZRW1 parsing. (I'll eventually add back in the original parser as an option, but doing that will add more code/complexity to the project.)

 Importantly, the fpng decoder can explictly/purposely only decode PNG files written by fpng, otherwise it returns fpng::FPNG_DECODE_NOT_FPNG (so you can fall back to a general purpose PNG decoder).

 fpng's compressor places a special private ancillary chunk in its output files, which other PNG decompressors will ignore. The decompressor uses this chunk to determine if the file was written by fpng (enabling fast decompression). This chunk's definition is [here](https://github.com/richgel999/fpng/wiki/fdEC-PNG-chunk-definition).

-lodepng v20210627 fetched 12/18/2021
+In single pass mode (the default), fpng uses a set of precomputed Deflate dynamic Huffman tables. Here's [how to use the fpng_test tool to compute custom tables](https://github.com/richgel999/fpng/wiki/How-to-train-new-Huffman-tables-for-custom-content). 

-stb_image_write.h v1.16 fetched 12/18/2021
-
-qoi.h fetched 12/18/2021
+Earlier versions of fpng (before 1.0.5) wrote valid PNG's that wuffs wouldn't accept. As far as I can tell, this is a [bug in wuffs](https://github.com/google/wuffs/issues/66). I've added a workaround to fpng's encoder and re-trained its single pass Huffman tables, and I've also added the wuffs decoder to the fpng_test app.

 ## Low-level description

--- a/fpng.vcxproj
+++ b/fpng.vcxproj
@@ -134,6 +134,7 @@
      <SDLCheck>true</SDLCheck>
      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <ConformanceMode>true</ConformanceMode>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -146,6 +147,7 @@
    <ClCompile Include="src/fpng.cpp" />
    <ClCompile Include="src/fpng_test.cpp" />
    <ClCompile Include="src/lodepng.cpp" />
+    <ClCompile Include="src\pvpngreader.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="src/fpng.h" />
@@ -153,6 +155,8 @@
    <ClInclude Include="src/qoi.h" />
    <ClInclude Include="src/stb_image.h" />
    <ClInclude Include="src/stb_image_write.h" />
+    <ClInclude Include="src\basisu_miniz.h" />
+    <ClInclude Include="src\pvpngreader.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/fpng.vcxproj.filters
+++ b/fpng.vcxproj.filters
@@ -24,6 +24,9 @@
    <ClCompile Include="src/lodepng.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="src\pvpngreader.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="src/fpng.h">
@@ -41,5 +44,11 @@
    <ClInclude Include="src/lodepng.h">
      <Filter>Source Files</Filter>
    </ClInclude>
+    <ClInclude Include="src\pvpngreader.h">
+      <Filter>Source Files</Filter>
+    </ClInclude>
+    <ClInclude Include="src\basisu_miniz.h">
+      <Filter>Source Files</Filter>
+    </ClInclude>
  </ItemGroup>
 </Project>
--- a/src/basisu_miniz.h
+++ b/src/basisu_miniz.h
--- a/src/fpng.cpp
+++ b/src/fpng.cpp
@@ -1,4 +1,5 @@
-// fpng.cpp - Fast 24/32bpp .PNG image writer/reader. See unlicense at the end of this file.
+// fpng.cpp 1.0.6 - Fast 24/32bpp .PNG image writer/reader. See unlicense at the end of this file.
+// PNG's generated by this code have been tested to load successfully with stb_image.h, lodepng.cpp, wuffs, libpng, and pngcheck.
 //
 // Uses code from the simple PNG writer function by Alex Evans, 2011. Released into the public domain: https://gist.github.com/908299
 // Some low-level Deflate/Huffman functions derived from the original 2011 Google Code version of miniz (public domain by R. Geldreich, Jr.): https://code.google.com/archive/p/miniz/
@@ -290,6 +291,8 @@ namespace fpng
 	}
 #endif

+#if FPNG_X86_OR_X64_CPU && !FPNG_NO_SSE 
+
 #ifndef _MSC_VER
 	static void do_cpuid(uint32_t eax, uint32_t ecx, uint32_t* regs)
 	{
@@ -308,7 +311,6 @@ namespace fpng
 	}
 #endif

-#if FPNG_X86_OR_X64_CPU && !FPNG_NO_SSE 
 	struct cpu_info
 	{
 		cpu_info() { memset(this, 0, sizeof(*this)); }
@@ -400,49 +402,54 @@ namespace fpng

 #if FPNG_X86_OR_X64_CPU && !FPNG_NO_SSE 
 	// See "Fast Computation of Adler32 Checksums":
-	// https ://www.intel.com/content/www/us/en/developer/articles/technical/fast-computation-of-adler32-checksums.html
-	// SSE 4.1, 8 bytes per iteration, 2-2.5x faster than the scalar version.
-	static uint32_t adler32_sse_8(const uint8_t* p, size_t len, uint32_t initial)
+	// https://www.intel.com/content/www/us/en/developer/articles/technical/fast-computation-of-adler32-checksums.html
+	// SSE 4.1, 16 bytes per iteration
+	static uint32_t adler32_sse_16(const uint8_t* p, size_t len, uint32_t initial)
 	{
 		uint32_t s1 = initial & 0xFFFF, s2 = initial >> 16;
 		const uint32_t K = 65521;

-		while (len >= 8)
+		while (len >= 16)
 		{
-			__m128i a = _mm_setr_epi32(s1, 0, 0, 0), b = _mm_setr_epi32(0, 0, 0, 0), c = _mm_setr_epi32(0, 0, 0, 0), d = _mm_setr_epi32(0, 0, 0, 0);
+			__m128i a = _mm_setr_epi32(s1, 0, 0, 0), b = _mm_setzero_si128(), c = _mm_setzero_si128(), d = _mm_setzero_si128(), 
+				e = _mm_setzero_si128(), f = _mm_setzero_si128(), g = _mm_setzero_si128(), h = _mm_setzero_si128();

-			const size_t n = minimum<size_t>(len >> 3, 5552);
+			const size_t n = minimum<size_t>(len >> 4, 5552);

 			for (size_t i = 0; i < n; i++)
 			{
-				a = _mm_add_epi32(a, _mm_cvtepu8_epi32(_mm_set1_epi32(((const uint32_t*)p)[i * 2 + 0])));
-				b = _mm_add_epi32(b, a);
-				c = _mm_add_epi32(c, _mm_cvtepu8_epi32(_mm_set1_epi32(((const uint32_t*)p)[i * 2 + 1])));
-				d = _mm_add_epi32(d, c);
+				const __m128i v = _mm_loadu_si128((const __m128i*)(p + i * 16));
+				a = _mm_add_epi32(a, _mm_cvtepu8_epi32(_mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)))); b = _mm_add_epi32(b, a);
+				c = _mm_add_epi32(c, _mm_cvtepu8_epi32(_mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1)))); d = _mm_add_epi32(d, c);
+				e = _mm_add_epi32(e, _mm_cvtepu8_epi32(_mm_shuffle_epi32(v, _MM_SHUFFLE(2, 2, 2, 2)))); f = _mm_add_epi32(f, e);
+				g = _mm_add_epi32(g, _mm_cvtepu8_epi32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)))); h = _mm_add_epi32(h, g);
 			}

-			uint32_t sa[8], sb[8];
-			_mm_storeu_si128((__m128i *)sa, a); _mm_storeu_si128((__m128i *)(sa + 4), c);
-			_mm_storeu_si128((__m128i *)sb, b); _mm_storeu_si128((__m128i *)(sb + 4), d);
+			uint32_t sa[16], sb[16];
+			_mm_storeu_si128((__m128i*)sa, a); _mm_storeu_si128((__m128i*)(sa + 4), c);
+			_mm_storeu_si128((__m128i*)sb, b); _mm_storeu_si128((__m128i*)(sb + 4), d);
+			_mm_storeu_si128((__m128i*)(sa + 8), e); _mm_storeu_si128((__m128i*)(sa + 12), g);
+			_mm_storeu_si128((__m128i*)(sb + 8), f); _mm_storeu_si128((__m128i*)(sb + 12), h);

+			// This could be vectorized, but it's only executed every 5552*16 iterations.
 			uint64_t vs1 = 0;
-			for (uint32_t i = 0; i < 8; i++)
+			for (uint32_t i = 0; i < 16; i++)
 				vs1 += sa[i];

 			uint64_t vs2_a = 0;
-			for (uint32_t i = 0; i < 8; i++)
+			for (uint32_t i = 0; i < 16; i++)
 				vs2_a += sa[i] * (uint64_t)i;
 			uint64_t vs2_b = 0;
-			for (uint32_t i = 0; i < 8; i++)
+			for (uint32_t i = 0; i < 16; i++)
 				vs2_b += sb[i];
-			vs2_b *= 8U;
+			vs2_b *= 16U;
 			uint64_t vs2 = vs2_b - vs2_a + s2;

 			s1 = (uint32_t)(vs1 % K);
 			s2 = (uint32_t)(vs2 % K);

-			p += n * 8;
-			len -= n * 8;
+			p += n * 16;
+			len -= n * 16;
 		}

 		for (; len; len--)
@@ -470,13 +477,13 @@ namespace fpng
 		return (s2 << 16) + s1;
 	}

-	uint32_t fpng_adler32(const uint8_t* ptr, size_t buf_len, uint32_t adler)
+	uint32_t fpng_adler32(const void* pData, size_t size, uint32_t adler)
 	{
 #if FPNG_X86_OR_X64_CPU && !FPNG_NO_SSE 
 		if (g_cpu_info.can_use_sse41())
-			return adler32_sse_8(ptr, buf_len, adler);
+			return adler32_sse_16((const uint8_t*)pData, size, adler);
 #endif
-		return fpng_adler32_scalar(ptr, buf_len, adler);
+		return fpng_adler32_scalar((const uint8_t*)pData, size, adler);
 	}

 	// Ensure we've been configured for endianness correctly.
@@ -520,56 +527,38 @@ namespace fpng
 		
 	static const uint32_t g_bitmasks[17] = { 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF };

-	static const uint8_t g_dyn_huff_3[] = { 120, 1, 229, 194, 3, 176, 37, 75, 148, 5, 208, 189, 79, 102, 86, 213, 197, 99, 187, 231, 143, 109, 219, 182, 109, 219, 182, 109, 219, 182, 109, 219,
-		198, 31, 207, 159, 118, 63, 94, 84, 85, 102, 158, 61, 21, 241, 34, 58, 38, 198, 102, 196 };
-	const uint32_t DYN_HUFF_3_BITBUF = 0x2, DYN_HUFF_3_BITBUF_SIZE = 3;
-		
-	static const struct { uint8_t m_code_size; uint16_t m_code; } g_dyn_huff_3_codes[288] =
-	{
-		{3,0x0},{3,0x4},{4,0x6},{5,0x1},{5,0x11},{5,0x9},{6,0xD},{6,0x2D},{6,0x1D},{7,0x33},{7,0x73},{7,0xB},{7,0x4B},{8,0x3B},{8,0xBB},{8,0x7B},
-		{8,0xFB},{8,0x7},{8,0x87},{9,0x97},{9,0x197},{9,0x57},{9,0x157},{9,0xD7},{9,0x1D7},{9,0x37},{9,0x137},{12,0x24F},{10,0x18F},{12,0xA4F},{12,0x64F},{12,0xE4F},
-		{12,0x14F},{12,0x94F},{12,0x54F},{12,0xD4F},{12,0x34F},{12,0xB4F},{12,0x74F},{12,0xF4F},{12,0xCF},{12,0x8CF},{12,0x4CF},{12,0xCCF},{12,0x2CF},{12,0xACF},{12,0x6CF},{12,0xECF},
-		{12,0x1CF},{12,0x9CF},{12,0x5CF},{12,0xDCF},{12,0x3CF},{12,0xBCF},{12,0x7CF},{12,0xFCF},{12,0x2F},{12,0x82F},{12,0x42F},{12,0xC2F},{12,0x22F},{12,0xA2F},{12,0x62F},{12,0xE2F},
-		{12,0x12F},{12,0x92F},{12,0x52F},{12,0xD2F},{12,0x32F},{12,0xB2F},{12,0x72F},{12,0xF2F},{12,0xAF},{12,0x8AF},{12,0x4AF},{12,0xCAF},{12,0x2AF},{12,0xAAF},{12,0x6AF},{12,0xEAF},
-		{12,0x1AF},{12,0x9AF},{12,0x5AF},{12,0xDAF},{12,0x3AF},{12,0xBAF},{12,0x7AF},{12,0xFAF},{12,0x6F},{12,0x86F},{12,0x46F},{12,0xC6F},{12,0x26F},{12,0xA6F},{12,0x66F},{12,0xE6F},
-		{12,0x16F},{12,0x96F},{12,0x56F},{12,0xD6F},{12,0x36F},{12,0xB6F},{12,0x76F},{12,0xF6F},{12,0xEF},{12,0x8EF},{12,0x4EF},{12,0xCEF},{12,0x2EF},{12,0xAEF},{12,0x6EF},{12,0xEEF},
-		{12,0x1EF},{12,0x9EF},{12,0x5EF},{12,0xDEF},{12,0x3EF},{12,0xBEF},{12,0x7EF},{12,0xFEF},{12,0x1F},{12,0x81F},{12,0x41F},{12,0xC1F},{12,0x21F},{12,0xA1F},{12,0x61F},{12,0xE1F},
-		{12,0x11F},{12,0x91F},{12,0x51F},{12,0xD1F},{12,0x31F},{12,0xB1F},{12,0x71F},{12,0xF1F},{12,0x9F},{12,0x89F},{12,0x49F},{12,0xC9F},{12,0x29F},{12,0xA9F},{12,0x69F},{12,0xE9F},
-		{12,0x19F},{12,0x99F},{12,0x59F},{12,0xD9F},{12,0x39F},{12,0xB9F},{12,0x79F},{12,0xF9F},{12,0x5F},{12,0x85F},{12,0x45F},{12,0xC5F},{12,0x25F},{12,0xA5F},{12,0x65F},{12,0xE5F},
-		{12,0x15F},{12,0x95F},{12,0x55F},{12,0xD5F},{12,0x35F},{12,0xB5F},{12,0x75F},{12,0xF5F},{12,0xDF},{12,0x8DF},{12,0x4DF},{12,0xCDF},{12,0x2DF},{12,0xADF},{12,0x6DF},{12,0xEDF},
-		{12,0x1DF},{12,0x9DF},{12,0x5DF},{12,0xDDF},{12,0x3DF},{12,0xBDF},{12,0x7DF},{12,0xFDF},{12,0x3F},{12,0x83F},{12,0x43F},{12,0xC3F},{12,0x23F},{12,0xA3F},{12,0x63F},{12,0xE3F},
-		{12,0x13F},{12,0x93F},{12,0x53F},{12,0xD3F},{12,0x33F},{12,0xB3F},{12,0x73F},{12,0xF3F},{12,0xBF},{12,0x8BF},{12,0x4BF},{12,0xCBF},{12,0x2BF},{12,0xABF},{12,0x6BF},{12,0xEBF},
-		{12,0x1BF},{12,0x9BF},{12,0x5BF},{12,0xDBF},{12,0x3BF},{12,0xBBF},{12,0x7BF},{12,0xFBF},{12,0x7F},{12,0x87F},{12,0x47F},{10,0x38F},{12,0xC7F},{12,0x27F},{12,0xA7F},{12,0x67F},
-		{12,0xE7F},{12,0x17F},{12,0x97F},{12,0x57F},{10,0x4F},{12,0xD7F},{9,0xB7},{9,0x1B7},{9,0x77},{9,0x177},{9,0xF7},{9,0x1F7},{9,0xF},{9,0x10F},{8,0x47},{8,0xC7},
-		{8,0x27},{8,0xA7},{8,0x67},{8,0xE7},{7,0x2B},{7,0x6B},{7,0x1B},{7,0x5B},{6,0x3D},{6,0x3},{6,0x23},{5,0x19},{5,0x5},{5,0x15},{4,0xE},{3,0x2},
-		{12,0x37F},{6,0x13},{0,0x0},{0,0x0},{8,0x17},{0,0x0},{0,0x0},{9,0x8F},{0,0x0},{12,0xB7F},{0,0x0},{12,0x77F},{12,0xF7F},{12,0xFF},{12,0x8FF},{12,0x4FF},
-		{12,0xCFF},{12,0x2FF},{12,0xAFF},{12,0x6FF},{12,0xEFF},{12,0x1FF},{12,0x9FF},{12,0x5FF},{12,0xDFF},{12,0x3FF},{12,0xBFF},{12,0x7FF},{12,0xFFF},{0,0x0},{0,0x0},{0,0x0}
+	// Huffman tables generated by fpng_test -t @filelist.txt. Total alpha files : 1440, Total opaque files : 5627.
+	// Feel free to retrain the encoder on your opaque/alpha PNG files by setting FPNG_TRAIN_HUFFMAN_TABLES and running fpng_test with the -t option.
+	static const uint8_t g_dyn_huff_3[] = {
+	120, 1, 237, 195, 3, 176, 110, 89, 122, 128, 225, 247, 251, 214, 218, 248, 113, 124, 173, 190, 109, 12, 50, 201, 196, 182, 109, 219, 182, 109, 219, 182,
+	109, 219, 201, 36, 147, 153, 105, 235, 246, 53, 142, 207, 143, 141, 181, 214, 151, 93, 117, 170, 78, 117, 117, 58, 206, 77, 210, 217, 169, 122 };
+	const uint32_t DYN_HUFF_3_BITBUF = 30, DYN_HUFF_3_BITBUF_SIZE = 7;
+	static const struct { uint8_t m_code_size; uint16_t m_code; } g_dyn_huff_3_codes[288] = {
+	{2,0},{4,2},{4,10},{5,14},{5,30},{6,25},{6,57},{6,5},{6,37},{7,3},{7,67},{7,35},{7,99},{8,11},{8,139},{8,75},{8,203},{8,43},{8,171},{8,107},{9,135},{9,391},{9,71},{9,327},{9,199},{9,455},{9,39},{9,295},{9,167},{9,423},{9,103},{10,183},
+	{9,359},{10,695},{10,439},{10,951},{10,119},{10,631},{10,375},{10,887},{10,247},{10,759},{10,503},{11,975},{11,1999},{11,47},{11,1071},{12,1199},{11,559},{12,3247},{12,687},{11,1583},{12,2735},{12,1711},{12,3759},{12,431},{12,2479},{12,1455},{12,3503},{12,943},{12,2991},{12,1967},{12,4015},{12,111},
+	{12,2159},{12,1135},{12,3183},{12,623},{12,2671},{12,1647},{12,3695},{12,367},{12,2415},{12,1391},{12,3439},{12,879},{12,2927},{12,1903},{12,3951},{12,239},{12,2287},{12,1263},{12,3311},{12,751},{12,2799},{12,1775},{12,3823},{12,495},{12,2543},{12,1519},{12,3567},{12,1007},{12,3055},{12,2031},{12,4079},{12,31},
+	{12,2079},{12,1055},{12,3103},{12,543},{12,2591},{12,1567},{12,3615},{12,287},{12,2335},{12,1311},{12,3359},{12,799},{12,2847},{12,1823},{12,3871},{12,159},{12,2207},{12,1183},{12,3231},{12,671},{12,2719},{12,1695},{12,3743},{12,415},{12,2463},{12,1439},{12,3487},{12,927},{12,2975},{12,1951},{12,3999},{12,95},
+	{12,2143},{12,1119},{12,3167},{12,607},{12,2655},{12,1631},{12,3679},{12,351},{12,2399},{12,1375},{12,3423},{12,863},{12,2911},{12,1887},{12,3935},{12,223},{12,2271},{12,1247},{12,3295},{12,735},{12,2783},{12,1759},{12,3807},{12,479},{12,2527},{12,1503},{12,3551},{12,991},{12,3039},{12,2015},{12,4063},{12,63},
+	{12,2111},{12,1087},{12,3135},{12,575},{12,2623},{12,1599},{12,3647},{12,319},{12,2367},{12,1343},{12,3391},{12,831},{12,2879},{12,1855},{12,3903},{12,191},{12,2239},{12,1215},{12,3263},{12,703},{12,2751},{12,1727},{12,3775},{12,447},{12,2495},{12,1471},{12,3519},{12,959},{12,3007},{12,1983},{12,4031},{12,127},
+	{12,2175},{12,1151},{12,3199},{12,639},{12,2687},{12,1663},{12,3711},{12,383},{12,2431},{12,1407},{12,3455},{12,895},{12,2943},{11,303},{12,1919},{12,3967},{11,1327},{12,255},{11,815},{11,1839},{11,175},{10,1015},{10,15},{10,527},{10,271},{10,783},{10,143},{10,655},{10,399},{10,911},{10,79},{10,591},
+	{9,231},{10,335},{9,487},{9,23},{9,279},{9,151},{9,407},{9,87},{9,343},{9,215},{9,471},{9,55},{8,235},{8,27},{8,155},{8,91},{8,219},{8,59},{8,187},{8,123},{7,19},{7,83},{7,51},{7,115},{6,21},{6,53},{6,13},{6,45},{5,1},{5,17},{5,9},{4,6},
+	{12,2303},{6,29},{0,0},{0,0},{8,251},{0,0},{0,0},{8,7},{0,0},{10,847},{0,0},{10,207},{12,1279},{10,719},{12,3327},{12,767},{12,2815},{12,1791},{12,3839},{12,511},{12,2559},{12,1535},{9,311},{12,3583},{12,1023},{12,3071},{10,463},{12,2047},{6,61},{12,4095},{0,0},{0,0}
 	};

-	static const uint8_t g_dyn_huff_4[] = { 120,1,229,195,83,144,37,219,182,0,208,49,87,230,70,177,171,121,204,171,103,219,182,109,219,182,109,219,182,109,219,214,
-		197,177,154,213,197,141,204,53,95,228,71,69,116,156,56,207,126,251,99 };
-	const uint32_t DYN_HUFF_4_BITBUF = 0x0, DYN_HUFF_4_BITBUF_SIZE = 2;
-
-	static const struct { uint8_t m_code_size; uint16_t m_code; } g_dyn_huff_4_codes[288] =
-	{
-		{1,0x0},{4,0x1},{5,0x5},{6,0xD},{6,0x2D},{7,0x23},{7,0x63},{7,0x13},{7,0x53},{8,0x6B},{8,0xEB},{8,0x1B},{8,0x9B},{8,0x5B},{8,0xDB},{9,0xA7},
-		{8,0x3B},{9,0x1A7},{9,0x67},{9,0x167},{9,0xE7},{9,0x1E7},{9,0x17},{10,0x137},{10,0x337},{10,0xB7},{10,0x2B7},{10,0x1B7},{10,0x3B7},{10,0x77},{10,0x277},{10,0x177},
-		{10,0x377},{10,0xF7},{10,0x2F7},{11,0x34F},{11,0x74F},{11,0xCF},{11,0x4CF},{11,0x2CF},{12,0x7CF},{12,0xFCF},{12,0x2F},{12,0x82F},{12,0x42F},{12,0xC2F},{12,0x22F},{12,0xA2F},
-		{12,0x62F},{12,0xE2F},{12,0x12F},{12,0x92F},{12,0x52F},{12,0xD2F},{12,0x32F},{12,0xB2F},{12,0x72F},{12,0xF2F},{12,0xAF},{12,0x8AF},{12,0x4AF},{12,0xCAF},{12,0x2AF},{12,0xAAF},
-		{12,0x6AF},{12,0xEAF},{12,0x1AF},{12,0x9AF},{12,0x5AF},{12,0xDAF},{12,0x3AF},{12,0xBAF},{12,0x7AF},{12,0xFAF},{12,0x6F},{12,0x86F},{12,0x46F},{12,0xC6F},{12,0x26F},{12,0xA6F},
-		{12,0x66F},{12,0xE6F},{12,0x16F},{12,0x96F},{12,0x56F},{12,0xD6F},{12,0x36F},{12,0xB6F},{12,0x76F},{12,0xF6F},{12,0xEF},{12,0x8EF},{12,0x4EF},{12,0xCEF},{12,0x2EF},{12,0xAEF},
-		{12,0x6EF},{12,0xEEF},{12,0x1EF},{12,0x9EF},{12,0x5EF},{12,0xDEF},{12,0x3EF},{12,0xBEF},{12,0x7EF},{12,0xFEF},{12,0x1F},{12,0x81F},{12,0x41F},{12,0xC1F},{12,0x21F},{12,0xA1F},
-		{12,0x61F},{12,0xE1F},{12,0x11F},{12,0x91F},{12,0x51F},{12,0xD1F},{12,0x31F},{12,0xB1F},{12,0x71F},{12,0xF1F},{12,0x9F},{12,0x89F},{12,0x49F},{12,0xC9F},{12,0x29F},{12,0xA9F},
-		{12,0x69F},{12,0xE9F},{12,0x19F},{12,0x99F},{12,0x59F},{12,0xD9F},{12,0x39F},{12,0xB9F},{12,0x79F},{12,0xF9F},{12,0x5F},{12,0x85F},{12,0x45F},{12,0xC5F},{12,0x25F},{12,0xA5F},
-		{12,0x65F},{12,0xE5F},{12,0x15F},{12,0x95F},{12,0x55F},{12,0xD5F},{12,0x35F},{12,0xB5F},{12,0x75F},{12,0xF5F},{12,0xDF},{12,0x8DF},{12,0x4DF},{12,0xCDF},{12,0x2DF},{12,0xADF},
-		{12,0x6DF},{12,0xEDF},{12,0x1DF},{12,0x9DF},{12,0x5DF},{12,0xDDF},{12,0x3DF},{12,0xBDF},{12,0x7DF},{12,0xFDF},{12,0x3F},{12,0x83F},{12,0x43F},{12,0xC3F},{12,0x23F},{12,0xA3F},
-		{12,0x63F},{12,0xE3F},{12,0x13F},{12,0x93F},{12,0x53F},{12,0xD3F},{12,0x33F},{12,0xB3F},{12,0x73F},{12,0xF3F},{12,0xBF},{12,0x8BF},{12,0x4BF},{12,0xCBF},{12,0x2BF},{12,0xABF},
-		{12,0x6BF},{12,0xEBF},{12,0x1BF},{12,0x9BF},{12,0x5BF},{12,0xDBF},{12,0x3BF},{12,0xBBF},{12,0x7BF},{12,0xFBF},{12,0x7F},{12,0x87F},{12,0x47F},{12,0xC7F},{12,0x27F},{12,0xA7F},
-		{12,0x67F},{12,0xE7F},{12,0x17F},{12,0x97F},{12,0x57F},{12,0xD7F},{12,0x37F},{12,0xB7F},{12,0x77F},{12,0xF7F},{12,0xFF},{11,0x6CF},{11,0x1CF},{11,0x5CF},{11,0x3CF},{10,0x1F7},
-		{10,0x3F7},{10,0xF},{10,0x20F},{10,0x10F},{10,0x30F},{10,0x8F},{10,0x28F},{10,0x18F},{10,0x38F},{10,0x4F},{9,0x117},{9,0x97},{9,0x197},{9,0x57},{9,0x157},{9,0xD7},
-		{8,0xBB},{9,0x1D7},{8,0x7B},{8,0xFB},{8,0x7},{8,0x87},{8,0x47},{8,0xC7},{7,0x33},{7,0x73},{7,0xB},{7,0x4B},{6,0x1D},{6,0x3D},{5,0x15},{4,0x9},
-		{12,0x8FF},{0,0x0},{6,0x3},{0,0x0},{0,0x0},{0,0x0},{8,0x27},{0,0x0},{0,0x0},{9,0x37},{0,0x0},{10,0x24F},{0,0x0},{10,0x14F},{12,0x4FF},{12,0xCFF},
-		{12,0x2FF},{12,0xAFF},{12,0x6FF},{12,0xEFF},{12,0x1FF},{12,0x9FF},{12,0x5FF},{12,0xDFF},{12,0x3FF},{12,0xBFF},{12,0x7FF},{12,0xFFF},{7,0x2B},{0,0x0},{0,0x0},{0,0x0},
+	static const uint8_t g_dyn_huff_4[] = {
+	120, 1, 229, 196, 99, 180, 37, 103, 218, 128, 225, 251, 121, 171, 106, 243, 216, 231, 180, 109, 196, 182, 51, 51, 73, 6, 201, 216, 182, 109, 219, 182,
+	17, 140, 98, 219, 102, 219, 60, 125, 172, 205, 170, 122, 159, 111, 213, 143, 179, 214, 94, 189, 58, 153, 104, 166, 103, 190, 247, 199, 117 };
+	const uint32_t DYN_HUFF_4_BITBUF = 1, DYN_HUFF_4_BITBUF_SIZE = 2;
+	static const struct { uint8_t m_code_size; uint16_t m_code; } g_dyn_huff_4_codes[288] = {
+	{2,0},{4,2},{5,6},{6,30},{6,62},{6,1},{7,41},{7,105},{7,25},{7,89},{7,57},{7,121},{8,117},{8,245},{8,13},{8,141},{8,77},{8,205},{8,45},{8,173},{8,109},{8,237},{8,29},{8,157},{8,93},{8,221},{8,61},{9,83},{9,339},{9,211},{9,467},{9,51},
+	{9,307},{9,179},{9,435},{9,115},{9,371},{9,243},{9,499},{9,11},{9,267},{9,139},{9,395},{9,75},{9,331},{9,203},{9,459},{9,43},{9,299},{10,7},{10,519},{10,263},{10,775},{10,135},{10,647},{10,391},{10,903},{10,71},{10,583},{10,327},{10,839},{10,199},{10,711},{10,455},
+	{10,967},{10,39},{10,551},{10,295},{10,807},{10,167},{10,679},{10,423},{10,935},{10,103},{10,615},{11,463},{11,1487},{11,975},{10,359},{10,871},{10,231},{11,1999},{11,47},{11,1071},{11,559},{10,743},{10,487},{11,1583},{11,303},{11,1327},{11,815},{11,1839},{11,175},{11,1199},{11,687},{11,1711},
+	{11,431},{11,1455},{11,943},{11,1967},{11,111},{11,1135},{11,623},{11,1647},{11,367},{11,1391},{11,879},{11,1903},{11,239},{11,1263},{11,751},{11,1775},{11,495},{11,1519},{11,1007},{11,2031},{11,31},{11,1055},{11,543},{11,1567},{11,287},{11,1311},{11,799},{11,1823},{11,159},{11,1183},{11,671},{11,1695},
+	{11,415},{11,1439},{11,927},{11,1951},{11,95},{11,1119},{11,607},{11,1631},{11,351},{11,1375},{11,863},{11,1887},{11,223},{11,1247},{11,735},{11,1759},{11,479},{11,1503},{11,991},{11,2015},{11,63},{11,1087},{11,575},{11,1599},{11,319},{11,1343},{11,831},{11,1855},{11,191},{11,1215},{11,703},{11,1727},
+	{11,447},{11,1471},{11,959},{11,1983},{11,127},{11,1151},{11,639},{11,1663},{11,383},{10,999},{10,23},{10,535},{10,279},{11,1407},{11,895},{11,1919},{11,255},{11,1279},{10,791},{10,151},{10,663},{10,407},{10,919},{10,87},{10,599},{10,343},{10,855},{10,215},{10,727},{10,471},{10,983},{10,55},
+	{10,567},{10,311},{10,823},{10,183},{10,695},{10,439},{10,951},{10,119},{10,631},{10,375},{10,887},{10,247},{10,759},{10,503},{10,1015},{10,15},{10,527},{10,271},{10,783},{10,143},{10,655},{10,399},{9,171},{9,427},{9,107},{9,363},{9,235},{9,491},{9,27},{9,283},{9,155},{9,411},
+	{9,91},{9,347},{9,219},{9,475},{9,59},{9,315},{9,187},{9,443},{8,189},{9,123},{8,125},{8,253},{8,3},{8,131},{8,67},{8,195},{8,35},{8,163},{8,99},{8,227},{8,19},{7,5},{7,69},{7,37},{7,101},{7,21},{7,85},{6,33},{6,17},{6,49},{5,22},{4,10},
+	{12,2047},{0,0},{6,9},{0,0},{0,0},{0,0},{8,147},{0,0},{0,0},{7,53},{0,0},{9,379},{0,0},{9,251},{10,911},{10,79},{11,767},{10,591},{10,335},{10,847},{10,207},{10,719},{11,1791},{11,511},{9,507},{11,1535},{11,1023},{12,4095},{5,14},{0,0},{0,0},{0,0}
 	};

 #define PUT_BITS(bb, ll) do { uint32_t b = bb, l = ll; assert((l) >= 0 && (l) <= 16); assert((b) < (1ULL << (l))); bit_buf |= (((uint64_t)(b)) << bit_buf_size); bit_buf_size += (l); assert(bit_buf_size <= 64); } while(0)
@@ -601,8 +590,8 @@ namespace fpng
 	enum
 	{
 		DEFL_MAX_HUFF_TABLES = 3,
-		DEFL_MAX_HUFF_SYMBOLS = 288,
-		DEFL_MAX_HUFF_SYMBOLS_0 = 288,
+		DEFL_MAX_HUFF_SYMBOLS = 288,	
+		DEFL_MAX_HUFF_SYMBOLS_0 = 288,	
 		DEFL_MAX_HUFF_SYMBOLS_1 = 32,
 		DEFL_MAX_HUFF_SYMBOLS_2 = 19,
 		DEFL_LZ_DICT_SIZE = 32768,
@@ -611,6 +600,10 @@ namespace fpng
 		DEFL_MAX_MATCH_LEN = 258
 	};

+#if FPNG_TRAIN_HUFFMAN_TABLES
+	uint64_t g_huff_counts[HUFF_COUNTS_SIZE];
+#endif
+
 	struct defl_huff
 	{
 		uint16_t m_huff_count[DEFL_MAX_HUFF_TABLES][DEFL_MAX_HUFF_SYMBOLS];
@@ -755,6 +748,12 @@ do { \
 		int num_lit_codes, num_dist_codes, num_bit_lengths; uint32_t i, total_code_sizes_to_pack, num_packed_code_sizes, rle_z_count, rle_repeat_count, packed_code_sizes_index;
 		uint8_t code_sizes_to_pack[DEFL_MAX_HUFF_SYMBOLS_0 + DEFL_MAX_HUFF_SYMBOLS_1], packed_code_sizes[DEFL_MAX_HUFF_SYMBOLS_0 + DEFL_MAX_HUFF_SYMBOLS_1], prev_code_size = 0xFF;

+#if FPNG_TRAIN_HUFFMAN_TABLES
+		assert(HUFF_COUNTS_SIZE == DEFL_MAX_HUFF_SYMBOLS_0);
+		for (uint32_t i = 0; i < DEFL_MAX_HUFF_SYMBOLS_0; i++)
+			g_huff_counts[i] += d->m_huff_count[0][i];
+#endif
+
 		d->m_huff_count[0][256] = 1;

 		defl_optimize_huffman_table(d, 0, DEFL_MAX_HUFF_SYMBOLS_0, 12, FPNG_FALSE);
@@ -907,6 +906,87 @@ do { \
 		}
 	}

+#if FPNG_TRAIN_HUFFMAN_TABLES
+	bool create_dynamic_block_prefix(uint64_t* pFreq, uint32_t num_chans, std::vector<uint8_t>& prefix, uint64_t& bit_buf, int &bit_buf_size, uint32_t* pCodes, uint8_t* pCodesizes)
+	{
+		assert((num_chans == 3) || (num_chans == 4));
+		assert(HUFF_COUNTS_SIZE == DEFL_MAX_HUFF_SYMBOLS_0); // must be equal
+		// Builds the zlib header + dynamic block header for the literal/length histogram in pFreq; outputs the header bytes, the leftover bit buffer state and the per-symbol codes/code sizes. Returns false if defl_start_dynamic_block() fails.
+		defl_huff dh;
+		memset(&dh, 0, sizeof(dh));
+
+		uint32_t lit_freq[DEFL_MAX_HUFF_SYMBOLS_0];
+		// Scale the 64-bit counts down by a common shift until every one fits in 32 bits (non-zero counts are clamped to at least 1).
+		uint32_t shift_len = 0;
+		for (; ; )
+		{
+			uint32_t i;
+			for (i = 0; i < DEFL_MAX_HUFF_SYMBOLS_0; i++)
+			{
+				uint64_t f = pFreq[i];
+				if (f)
+					f = maximum<uint64_t>(1U, f >> shift_len);
+
+				if (f > UINT32_MAX)
+					break;
+
+				lit_freq[i] = (uint32_t)f; // store the scaled count; casting the raw pFreq[i] would truncate counts above UINT32_MAX
+			}
+
+			if (i == DEFL_MAX_HUFF_SYMBOLS_0)
+				break;
+			
+			shift_len++;
+		}
+				
+		// Ensure all valid Deflate literal/EOB/length syms are non-zero, so anything can be coded.
+		for (uint32_t i = 0; i <= 256; i++)
+		{
+			if (!lit_freq[i])
+				lit_freq[i] = 1;
+		}
+
+		for (uint32_t len = num_chans; len <= DEFL_MAX_MATCH_LEN; len += num_chans)
+		{
+			uint32_t sym = g_defl_len_sym[len - 3];
+			if (!lit_freq[sym])
+				lit_freq[sym] = 1;
+		}
+
+		adjust_freq32(DEFL_MAX_HUFF_SYMBOLS_0, lit_freq, &dh.m_huff_count[0][0]);
+		
+		const uint32_t dist_sym = g_defl_small_dist_sym[num_chans - 1];
+		dh.m_huff_count[1][dist_sym] = 1;
+		dh.m_huff_count[1][dist_sym + 1] = 1; // to workaround a bug in wuffs decoder
+			
+		prefix.resize(4096);
+		uint8_t* pDst = prefix.data();
+		uint32_t dst_buf_size = (uint32_t)prefix.size();
+
+		uint32_t dst_ofs = 0;
+
+		// zlib header
+		PUT_BITS(0x78, 8);
+		PUT_BITS(0x01, 8);
+
+		// write BFINAL bit
+		PUT_BITS(1, 1);
+				
+		if (!defl_start_dynamic_block(&dh, pDst, dst_ofs, dst_buf_size, bit_buf, bit_buf_size))
+			return false;
+
+		prefix.resize(dst_ofs);
+
+		for (uint32_t i = 0; i < DEFL_MAX_HUFF_SYMBOLS_0; i++)
+		{
+			pCodes[i] = dh.m_huff_codes[0][i];
+			pCodesizes[i] = dh.m_huff_code_sizes[0][i];
+		}
+
+		return true;
+	}
+#endif
+
 	static uint32_t pixel_deflate_dyn_3_rle(
 		const uint8_t* pImg, uint32_t w, uint32_t h,
 		uint8_t* pDst, uint32_t dst_buf_size)
@@ -1015,6 +1095,7 @@ do { \

 		memset(&dh.m_huff_count[1][0], 0, sizeof(dh.m_huff_count[1][0]) * DEFL_MAX_HUFF_SYMBOLS_1);
 		dh.m_huff_count[1][dist_sym] = 1;
+		dh.m_huff_count[1][dist_sym + 1] = 1; // to workaround a bug in wuffs decoder

 		if (!defl_start_dynamic_block(&dh, pDst, dst_ofs, dst_buf_size, bit_buf, bit_buf_size))
 			return 0;
@@ -1293,6 +1374,7 @@ do { \
 		
 		memset(&dh.m_huff_count[1][0], 0, sizeof(dh.m_huff_count[1][0]) * DEFL_MAX_HUFF_SYMBOLS_1);
 		dh.m_huff_count[1][dist_sym] = 1;
+		dh.m_huff_count[1][dist_sym + 1] = 1; // to workaround a bug in wuffs decoder

 		if (!defl_start_dynamic_block(&dh, pDst, dst_ofs, dst_buf_size, bit_buf, bit_buf_size))
 			return 0;
@@ -1527,33 +1609,48 @@ do_literals:
 			// Previous scanline
 			*pDst++ = 2;

-			if (num_chans == 3)
+#if FPNG_X86_OR_X64_CPU && !FPNG_NO_SSE
+			if (g_cpu_info.can_use_sse41())
 			{
-				for (uint32_t x = 0; x < (uint32_t)w; x++)
-				{
-					pDst[0] = (uint8_t)(pSrc[0] - pPrev_src[0]);
-					pDst[1] = (uint8_t)(pSrc[1] - pPrev_src[1]);
-					pDst[2] = (uint8_t)(pSrc[2] - pPrev_src[2]);
+				uint32_t bytes_to_process = w * num_chans, ofs = 0;
+				for (; bytes_to_process >= 16; bytes_to_process -= 16, ofs += 16)
+					_mm_storeu_si128((__m128i*)(pDst + ofs), _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(pSrc + ofs)), _mm_loadu_si128((const __m128i*)(pPrev_src + ofs))));

-					pSrc += 3;
-					pPrev_src += 3;
-					pDst += 3;
-				}
+				for (; bytes_to_process; bytes_to_process--, ofs++)
+					pDst[ofs] = (uint8_t)(pSrc[ofs] - pPrev_src[ofs]);
 			}
 			else
+#endif
 			{
-				for (uint32_t x = 0; x < (uint32_t)w; x++)
+				if (num_chans == 3)
 				{
-					pDst[0] = (uint8_t)(pSrc[0] - pPrev_src[0]);
-					pDst[1] = (uint8_t)(pSrc[1] - pPrev_src[1]);
-					pDst[2] = (uint8_t)(pSrc[2] - pPrev_src[2]);
-					pDst[3] = (uint8_t)(pSrc[3] - pPrev_src[3]);
+					for (uint32_t x = 0; x < (uint32_t)w; x++)
+					{
+						pDst[0] = (uint8_t)(pSrc[0] - pPrev_src[0]);
+						pDst[1] = (uint8_t)(pSrc[1] - pPrev_src[1]);
+						pDst[2] = (uint8_t)(pSrc[2] - pPrev_src[2]);

-					pSrc += 4;
-					pPrev_src += 4;
-					pDst += 4;
+						pSrc += 3;
+						pPrev_src += 3;
+						pDst += 3;
+					}
+				}
+				else
+				{
+					for (uint32_t x = 0; x < (uint32_t)w; x++)
+					{
+						pDst[0] = (uint8_t)(pSrc[0] - pPrev_src[0]);
+						pDst[1] = (uint8_t)(pSrc[1] - pPrev_src[1]);
+						pDst[2] = (uint8_t)(pSrc[2] - pPrev_src[2]);
+						pDst[3] = (uint8_t)(pSrc[3] - pPrev_src[3]);
+
+						pSrc += 4;
+						pPrev_src += 4;
+						pDst += 4;
+					}
 				}
 			}
+
 			break;
 		}
 		default:
@@ -1570,7 +1667,7 @@ do_literals:
 			return false;
 		}

-		if ((w < 1) || (h < 1) || (w * h > UINT32_MAX) || (w > FPNG_MAX_SUPPORTED_DIM) || (h > FPNG_MAX_SUPPORTED_DIM))
+		if ((w < 1) || (h < 1) || (w * (uint64_t)h > UINT32_MAX) || (w > FPNG_MAX_SUPPORTED_DIM) || (h > FPNG_MAX_SUPPORTED_DIM))
 		{
 			assert(0);
 			return false;
@@ -1868,9 +1965,7 @@ do_literals:

 		GET_BITS(num_dist_codes, 5);
 		num_dist_codes += 1;
-		if (num_dist_codes != num_chans)
-			return false;
-
+		
 		uint32_t total_codes = num_lit_codes + num_dist_codes;
 		if (total_codes > (DEFL_MAX_HUFF_SYMBOLS_0 + DEFL_MAX_HUFF_SYMBOLS_1))
 			return false;
@@ -1962,12 +2057,21 @@ do_literals:

 		uint32_t total_valid_distcodes = 0;
 		for (uint32_t i = 0; i < num_dist_codes; i++)
-			total_valid_distcodes += code_sizes[num_lit_codes + i];
-		if (total_valid_distcodes != 1)
+			total_valid_distcodes += (code_sizes[num_lit_codes + i] == 1);
+		
+		// 1 or 2 because the first version of FPNG only issued 1 valid distance code, but that upset wuffs. So we let 1 or 2 through.
+		if ((total_valid_distcodes < 1) || (total_valid_distcodes > 2))
 			return false;

 		if (code_sizes[num_lit_codes + (num_chans - 1)] != 1)
 			return false;
+
+		if (total_valid_distcodes == 2)
+		{
+			// If there are two valid distance codes, make sure the second one (the extra code written to work around wuffs) is also 1 bit.
+			if (code_sizes[num_lit_codes + num_chans] != 1)
+				return false;
+		}
 						
 		if (!build_decoder_table(num_lit_codes, lit_codesizes, pLit_table))
 			return false;
@@ -2135,7 +2239,7 @@ do_literals:
 		GET_BITS(bfinal, 1);
 		GET_BITS(btype, 2);

-		// Must be the final block or it's not valid, and type=1 (dynamic)
+		// Must be the final block or it's not valid, and type=2 (dynamic)
 		if ((bfinal != 1) || (btype != 2))
 			return false;
 		
@@ -2513,7 +2617,7 @@ do_literals:
 		GET_BITS(bfinal, 1);
 		GET_BITS(btype, 2);

-		// Must be the final block or it's not valid, and type=1 (dynamic)
+		// Must be the final block or it's not valid, and type=2 (dynamic)
 		if ((bfinal != 1) || (btype != 2))
 			return false;

--- a/src/fpng.h
+++ b/src/fpng.h
@@ -5,9 +5,14 @@
 #include <stdint.h>
 #include <vector>

+#ifndef FPNG_TRAIN_HUFFMAN_TABLES
+	// Set to 1 when using the -t (training) option in fpng_test to generate new opaque/alpha Huffman tables for the single pass encoder.
+	#define FPNG_TRAIN_HUFFMAN_TABLES (0)
+#endif
+
 namespace fpng
 {
-	// ---- Library initialization - call once to identify if the process supports SSE.
+	// ---- Library initialization - call once to identify if the processor supports SSE.
 	// Otherwise you'll only get scalar fallbacks.
 	void fpng_init();

@@ -23,7 +28,7 @@ namespace fpng

 	// Fast Adler32 SSE4.1 Adler-32 with a scalar fallback.
 	const uint32_t FPNG_ADLER32_INIT = 1;
-	uint32_t fpng_adler32(const uint8_t* ptr, size_t buf_len, uint32_t adler = FPNG_ADLER32_INIT);
+	uint32_t fpng_adler32(const void* pData, size_t size, uint32_t adler = FPNG_ADLER32_INIT);

 	// ---- Compression
 	enum
@@ -106,4 +111,12 @@ namespace fpng
 	int fpng_decode_file(const char* pFilename, std::vector<uint8_t>& out, uint32_t& width, uint32_t& height, uint32_t& channels_in_file, uint32_t desired_channels);
 #endif

+	// ---- Internal API used for Huffman table training purposes
+
+#if FPNG_TRAIN_HUFFMAN_TABLES
+	const uint32_t HUFF_COUNTS_SIZE = 288;
+	extern uint64_t g_huff_counts[HUFF_COUNTS_SIZE];
+	bool create_dynamic_block_prefix(uint64_t* pFreq, uint32_t num_chans, std::vector<uint8_t>& prefix, uint64_t& bit_buf, int& bit_buf_size, uint32_t *pCodes, uint8_t *pCodesizes);
+#endif
+
 } // namespace fpng
--- a/src/fpng_test.cpp
+++ b/src/fpng_test.cpp
@@ -23,6 +23,13 @@
 #define QOI_IMPLEMENTATION
 #include "qoi.h"

+#define WUFFS_IMPLEMENTATION
+#define WUFFS_CONFIG__STATIC_FUNCTIONS
+#include "wuffs-v0.3.c"
+
+#include "basisu_miniz.h"
+#include "pvpngreader.h"
+
 typedef std::vector<uint8_t> uint8_vec;

 typedef uint64_t timer_ticks;
@@ -272,7 +279,11 @@ static void write_func_stbi(void* context, void* data, int size)
 static bool load_listing_file(const std::string& f, std::vector<std::string>& filenames)
 {
 	std::string filename(f);
-	//filename.erase(0, 1);
+	if (filename.size() == 0)
+		return false;
+
+	if (filename[0] == '@')
+		filename.erase(0, 1);

 	FILE* pFile = nullptr;
 #ifdef _WIN32
@@ -670,6 +681,297 @@ static int fuzz_test_encoder2(uint32_t fpng_flags)
 	return EXIT_SUCCESS;
 }

+static void* wuffs_decode(void* pData, size_t data_len, uint32_t &width, uint32_t &height) 
+{
+	// Decodes the in-memory PNG at pData with wuffs into a 4-bytes-per-pixel RGBA_NONPREMUL buffer. Returns a malloc()'d buffer the caller must free(), or nullptr on any failure; width/height are only written once the image config has been decoded.
+	wuffs_png__decoder* pDec = wuffs_png__decoder__alloc();
+	if (!pDec) 
+		return nullptr;
+
+	wuffs_png__decoder__set_quirk_enabled(pDec, WUFFS_BASE__QUIRK_IGNORE_CHECKSUM, true);
+
+	wuffs_base__image_config ic;
+	wuffs_base__io_buffer src = wuffs_base__ptr_u8__reader((uint8_t *)pData, data_len, true);
+	wuffs_base__status status = wuffs_png__decoder__decode_image_config(pDec, &ic, &src);
+	if (status.repr) 
+	{
+		free(pDec);
+		return nullptr;
+	}
+
+	width = wuffs_base__pixel_config__width(&ic.pixcfg);
+	height = wuffs_base__pixel_config__height(&ic.pixcfg);
+
+	wuffs_base__pixel_config__set(&ic.pixcfg, WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL, WUFFS_BASE__PIXEL_SUBSAMPLING__NONE, width, height);
+	// Allocate the work buffer, sized to the maximum length the decoder reports.
+	uint64_t workbuf_len = wuffs_png__decoder__workbuf_len(pDec).max_incl;
+	if (workbuf_len > SIZE_MAX) 
+	{
+		free(pDec);
+		return nullptr;
+	}
+
+	wuffs_base__slice_u8 workbuf_slice = wuffs_base__make_slice_u8( (uint8_t *)malloc((size_t)workbuf_len), (size_t)workbuf_len); 
+	if (!workbuf_slice.ptr) 
+	{
+		free(pDec);
+		return nullptr;
+	}
+	// The output uses 4 bytes per pixel; reject pixel counts whose byte size would not fit in size_t.
+	const uint64_t total_pixels = (uint64_t)width * (uint64_t)height;
+	if (total_pixels > (SIZE_MAX >> 2U)) 
+	{
+		free(workbuf_slice.ptr);
+		free(pDec);
+		return nullptr;
+	}
+
+	void* pDecode_buf = malloc((size_t)(total_pixels * sizeof(uint32_t)));
+	if (!pDecode_buf)
+	{
+		free(workbuf_slice.ptr);
+		free(pDec);
+		return nullptr;
+	}
+
+	wuffs_base__slice_u8 pixbuf_slice = wuffs_base__make_slice_u8((uint8_t*)pDecode_buf, (size_t)(total_pixels * sizeof(uint32_t)));
+
+	wuffs_base__pixel_buffer pb;
+	status = wuffs_base__pixel_buffer__set_from_slice(&pb, &ic.pixcfg, pixbuf_slice);
+	
+	if (status.repr) 
+	{
+		free(workbuf_slice.ptr);
+		free(pDecode_buf);
+		free(pDec);
+		return nullptr;
+	}
+
+	status = wuffs_png__decoder__decode_frame(pDec, &pb, &src, WUFFS_BASE__PIXEL_BLEND__SRC, workbuf_slice, NULL);
+	
+	if (status.repr) 
+	{
+		free(workbuf_slice.ptr);
+		free(pDecode_buf);
+		free(pDec);
+		return nullptr;
+	}
+	// Success: release the temporaries; pDecode_buf is handed to the caller, who must free() it.
+	free(workbuf_slice.ptr);
+	free(pDec);
+
+	return pDecode_buf;
+}
+
+#if FPNG_TRAIN_HUFFMAN_TABLES
+static int training_mode(const char* pFilename)
+{
+	// Training mode: encodes every PNG named in the @listing file with FPNG_ENCODE_SLOWER, sums fpng::g_huff_counts separately for opaque and alpha images, then prints the resulting dynamic block prefixes and code tables as C source. Returns EXIT_SUCCESS or EXIT_FAILURE.
+	if (pFilename[0] != '@')
+	{
+		fprintf(stderr, "Must specify list of files to read using @filelist.txt\n");
+		return EXIT_FAILURE;
+	}
+	std::vector<std::string> files_to_process;
+
+	if (!load_listing_file(std::string(pFilename), files_to_process))
+		return EXIT_FAILURE;
+
+	uint64_t opaque_freq[fpng::HUFF_COUNTS_SIZE], alpha_freq[fpng::HUFF_COUNTS_SIZE];
+	memset(opaque_freq, 0, sizeof(opaque_freq));
+	memset(alpha_freq, 0, sizeof(alpha_freq));
+
+	uint32_t total_alpha_files = 0, total_opaque_files = 0, total_failed_loading = 0;
+
+	for (uint32_t file_index = 0; file_index < files_to_process.size(); file_index++)
+	{
+		const char* pFilename = files_to_process[file_index].c_str();
+
+		printf("Processing file \"%s\"\n", pFilename);
+
+		uint8_vec source_file_data;
+		if (!read_file_to_vec(pFilename, source_file_data))
+		{
+			fprintf(stderr, "Failed reading source file data \"%s\"\n", pFilename);
+			return EXIT_FAILURE;
+		}
+
+		uint32_t source_width = 0, source_height = 0;
+		uint8_t* pSource_image_buffer = nullptr;
+		unsigned error = lodepng_decode_memory(&pSource_image_buffer, &source_width, &source_height, source_file_data.data(), source_file_data.size(), LCT_RGBA, 8);
+		if (error != 0)
+		{
+			fprintf(stderr, "WARNING: Failed unpacking source file \"%s\" using lodepng! Skipping.\n", pFilename);
+			total_failed_loading++;
+			continue;
+		}
+
+		const color_rgba* pSource_pixels32 = (const color_rgba*)pSource_image_buffer;
+		uint32_t total_source_pixels = source_width * source_height;
+		bool has_alpha = false;
+		for (uint32_t i = 0; i < total_source_pixels; i++)
+		{
+			if (pSource_pixels32[i].m_c[3] < 255)
+			{
+				has_alpha = true;
+				break;
+			}
+		}
+
+		const uint32_t source_chans = has_alpha ? 4 : 3;
+
+		printf("Dimensions: %ux%u, Has Alpha: %u, Total Pixels: %u, bytes: %u (%f MB)\n", source_width, source_height, has_alpha, total_source_pixels, total_source_pixels * source_chans, total_source_pixels * source_chans / (1024.0f * 1024.0f));
+		// The source was decoded as RGBA; build a packed 24bpp copy to feed the encoder when the image is opaque.
+		uint8_vec source_image_buffer24(total_source_pixels * 3);
+		for (uint32_t i = 0; i < total_source_pixels; i++)
+		{
+			source_image_buffer24[i * 3 + 0] = pSource_pixels32[i].m_c[0];
+			source_image_buffer24[i * 3 + 1] = pSource_pixels32[i].m_c[1];
+			source_image_buffer24[i * 3 + 2] = pSource_pixels32[i].m_c[2];
+		}
+		const uint8_t* pSource_pixels24 = source_image_buffer24.data();
+		// Clear the encoder's global histogram so the counts gathered below belong to this file only.
+		memset(fpng::g_huff_counts, 0, sizeof(fpng::g_huff_counts));
+
+		std::vector<uint8_t> fpng_file_buf;
+		bool status = fpng::fpng_encode_image_to_memory((source_chans == 4) ? (const void*)pSource_pixels32 : (const void*)pSource_pixels24, source_width, source_height, source_chans, fpng_file_buf, fpng::FPNG_ENCODE_SLOWER);
+		if (!status)
+		{
+			fprintf(stderr, "fpng_encode_image_to_memory() failed!\n");
+			return EXIT_FAILURE;
+		}
+
+		// Sanity check the PNG file using lodepng
+		{
+			uint32_t lodepng_decoded_w = 0, lodepng_decoded_h = 0;
+			uint8_t* lodepng_decoded_buffer = nullptr;
+
+			int error = lodepng_decode_memory(&lodepng_decoded_buffer, &lodepng_decoded_w, &lodepng_decoded_h, (uint8_t*)fpng_file_buf.data(), fpng_file_buf.size(), LCT_RGBA, 8);
+			if (error != 0)
+			{
+				fprintf(stderr, "lodepng_decode_memory() failed!\n");
+				return EXIT_FAILURE;
+			}
+
+			if ((lodepng_decoded_w != source_width) || (lodepng_decoded_h != source_height) || (memcmp(lodepng_decoded_buffer, pSource_pixels32, total_source_pixels * 4) != 0))
+			{
+				fprintf(stderr, "FPNG decode verification failed (using lodepng)!\n");
+				return EXIT_FAILURE;
+			}
+			free(lodepng_decoded_buffer);
+		}
+
+		if (source_chans == 4)
+		{
+			for (uint32_t i = 0; i < fpng::HUFF_COUNTS_SIZE; i++)
+				alpha_freq[i] += fpng::g_huff_counts[i];
+
+			total_alpha_files++;
+		}
+		else
+		{
+			for (uint32_t i = 0; i < fpng::HUFF_COUNTS_SIZE; i++)
+				opaque_freq[i] += fpng::g_huff_counts[i];
+
+			total_opaque_files++;
+		}
+		free(pSource_image_buffer); // allocated by lodepng_decode_memory(); previously leaked once per file
+	} // file_index
+
+	printf("Total alpha files: %u\n", total_alpha_files);
+	printf("Total opaque files: %u\n", total_opaque_files);
+	printf("Total failed loading: %u\n", total_failed_loading);
+
+	if (!total_alpha_files && !total_opaque_files)
+	{
+		fprintf(stderr, "No files were loaded!\n");
+		return EXIT_FAILURE;
+	}
+
+	if (total_opaque_files)
+	{
+		std::vector<uint8_t> dyn_prefix;
+		uint64_t bit_buf = 0;
+		int bit_buf_size = 0;
+		uint32_t codes[fpng::HUFF_COUNTS_SIZE];
+		uint8_t codesizes[fpng::HUFF_COUNTS_SIZE];
+		
+		bool status = fpng::create_dynamic_block_prefix(opaque_freq, 3, dyn_prefix, bit_buf, bit_buf_size, codes, codesizes);
+		if (!status)
+		{
+			fprintf(stderr, "fpng::create_dynamic_block_prefix() failed!\n");
+			return EXIT_FAILURE;
+		}
+
+		printf("\n");
+		printf("static const uint8_t g_dyn_huff_3[] = {\n");
+		for (uint32_t i = 0; i < dyn_prefix.size(); i++) 
+		{ 
+			printf("%u%c ", dyn_prefix[i], (i != (dyn_prefix.size() - 1)) ? ',' : ' '); 
+			if ((i & 31) == 31) 
+				printf("\n"); 
+		}
+		printf("};\n");
+		printf("const uint32_t DYN_HUFF_3_BITBUF = %u, DYN_HUFF_3_BITBUF_SIZE = %u;\n", (uint32_t)bit_buf, (uint32_t)bit_buf_size);
+
+		printf("static const struct { uint8_t m_code_size; uint16_t m_code; } g_dyn_huff_3_codes[288] = {\n");
+		for (uint32_t i = 0; i < fpng::HUFF_COUNTS_SIZE; i++)
+		{
+			printf("{%u,%u}%c", codesizes[i], codes[i], (i != (fpng::HUFF_COUNTS_SIZE - 1)) ? ',' : ' ');
+			if ((i & 31) == 31)
+				printf("\n");
+		}
+		printf("};\n");
+	}
+
+	if (total_alpha_files)
+	{
+		std::vector<uint8_t> dyn_prefix;
+		uint64_t bit_buf = 0;
+		int bit_buf_size = 0;
+		uint32_t codes[fpng::HUFF_COUNTS_SIZE];
+		uint8_t codesizes[fpng::HUFF_COUNTS_SIZE];
+		bool status = fpng::create_dynamic_block_prefix(alpha_freq, 4, dyn_prefix, bit_buf, bit_buf_size, codes, codesizes);
+		if (!status)
+		{
+			fprintf(stderr, "fpng::create_dynamic_block_prefix() failed!\n");
+			return EXIT_FAILURE;
+		}
+
+		printf("\n");
+		printf("static const uint8_t g_dyn_huff_4[] = {\n");
+		for (uint32_t i = 0; i < dyn_prefix.size(); i++)
+		{
+			printf("%u%c ", dyn_prefix[i], (i != (dyn_prefix.size() - 1)) ? ',' : ' ');
+			if ((i & 31) == 31)
+				printf("\n");
+		}
+		printf("};\n");
+		printf("const uint32_t DYN_HUFF_4_BITBUF = %u, DYN_HUFF_4_BITBUF_SIZE = %u;\n", (uint32_t)bit_buf, (uint32_t)bit_buf_size);
+
+		printf("static const struct { uint8_t m_code_size; uint16_t m_code; } g_dyn_huff_4_codes[288] = {\n");
+		for (uint32_t i = 0; i < fpng::HUFF_COUNTS_SIZE; i++)
+		{
+			printf("{%u,%u}%c", codesizes[i], codes[i], (i != (fpng::HUFF_COUNTS_SIZE - 1)) ? ',' : ' ');
+			if ((i & 31) == 31)
+				printf("\n");
+		}
+		printf("};\n");
+	}
+
+	return EXIT_SUCCESS;
+}
+#else
+static int training_mode(const char* pFilename)
+{
+	(void)pFilename;
+	// Stub used when FPNG_TRAIN_HUFFMAN_TABLES is not enabled: report the required build setting and fail.
+	fprintf(stderr, "Must compile with FPNG_TRAIN_HUFFMAN_TABLES set to 1\n");
+
+	return EXIT_FAILURE;
+}
+#endif
+
 int main(int arg_c, char **arg_v)
 {
 	fpng::fpng_init();
@@ -686,6 +988,7 @@ int main(int arg_c, char **arg_v)
 		printf("-e: Fuzz encoder/decoder by randomly modifying an input image's pixels\n");
 		printf("-f: Decompress specified PNG image using FPNG, then exit\n");
 		printf("-a: Swizzle input image's green to alpha, for testing 32bpp correlation alpha\n");
+		printf("-t: Train Huffman tables on @filelist.txt (must compile with FPNG_TRAIN_HUFFMAN_TABLES=1)\n");
 		return EXIT_FAILURE;
 	}

@@ -698,6 +1001,7 @@ int main(int arg_c, char **arg_v)
 	bool fuzz_encoder2 = false;
 	bool fuzz_decoder = false;
 	bool swizzle_green_to_alpha = false;
+	bool training_mode_flag = false;

 	for (int i = 1; i < arg_c; i++)
 	{
@@ -732,6 +1036,10 @@ int main(int arg_c, char **arg_v)
 			{
 				swizzle_green_to_alpha = true;
 			}
+			else if (pArg[1] == 't')
+			{
+				training_mode_flag = true;
+			}
 			else
 			{
 				fprintf(stderr, "Unrecognized option: %s\n", pArg);
@@ -762,6 +1070,9 @@ int main(int arg_c, char **arg_v)
 	if (fuzz_encoder2)
 		return fuzz_test_encoder2(fpng_flags);

+	if (training_mode_flag)
+		return training_mode(pFilename);
+
 	if (!csv_flag)
 	{
 		printf("SSE 4.1 supported: %u\n", fpng::fpng_cpu_supports_sse41());
@@ -867,7 +1178,7 @@ int main(int arg_c, char **arg_v)

 	const uint8_t* pSource_pixels24 = source_image_buffer24.data();
 	
-	const uint32_t NUM_TIMES_TO_ENCODE = csv_flag ? 3 : 1;
+	const uint32_t NUM_TIMES_TO_ENCODE = csv_flag ? 3 : 3;
 	const uint32_t NUM_TIMES_TO_DECODE = 5;
 	interval_timer tm;

@@ -920,7 +1231,7 @@ int main(int arg_c, char **arg_v)
 #endif
 	}
 	
-	double fpng_decode_time = 0.0f, lodepng_decode_time = 0.0f, stbi_decode_time = 0.0f, qoi_decode_time = 0.0f;
+	double fpng_decode_time = 0.0f, lodepng_decode_time = 0.0f, stbi_decode_time = 0.0f, qoi_decode_time = 0.0f, wuffs_decode_time = 0.0f, pvpng_decode_time = 0.0f;

 	// Decode the file using our decompressor
 	{
@@ -1087,6 +1398,51 @@ int main(int arg_c, char **arg_v)
 		}
 		free(p);
 	}
+
+	// Verify FPNG's output data using wuffs
+	{
+		void* p = nullptr;
+
+		// Best-of-N timing: decode NUM_TIMES_TO_DECODE times and keep the fastest run.
+
+		wuffs_decode_time = 1e+9f;
+		for (uint32_t i = 0; i < NUM_TIMES_TO_DECODE; i++)
+		{
+			if (p)
+			{
+				free(p);
+				p = nullptr;
+			}
+
+			tm.start();
+			
+			uint32_t w, h;
+			p = wuffs_decode(fpng_file_buf.data(), fpng_file_buf.size(), w, h);
+			if (!p)
+				break;
+
+			if ((w != source_width) || (h != source_height))
+			{
+				fprintf(stderr, "wuffs failed decompressing FPNG's output PNG file!\n");
+				return EXIT_FAILURE;
+			}
+
+			wuffs_decode_time = minimum(wuffs_decode_time, tm.get_elapsed_secs());
+		}
+
+		if (!p)
+		{
+			fprintf(stderr, "wuffs failed decompressing FPNG's output PNG file!\n");
+			return EXIT_FAILURE;
+		}
+
+		if (memcmp(p, pSource_pixels32, total_source_pixels * 4) != 0)
+		{
+			fprintf(stderr, "FPNG decode verification failed (using wuffs)!\n");
+			return EXIT_FAILURE;
+		}
+		free(p);
+	}
 		
 	// Compress with lodepng

@@ -1141,7 +1497,7 @@ int main(int arg_c, char **arg_v)
 	}

 	if (!csv_flag)
-		printf("stbi:    %4.6f secs, %u bytes, %4.3f MB, %4.3f MP/s\n", stbi_best_time, (uint32_t)stbi_file_buf.size(), (double)stbi_file_buf.size() / (1024.0f * 1024.0f), (total_source_pixels / (1024.0f * 1024.0f)) / stbi_best_time);
+		printf("stbi:    %4.6f secs, %u bytes, %4.3f MB, %4.3f MP/sec\n", stbi_best_time, (uint32_t)stbi_file_buf.size(), (double)stbi_file_buf.size() / (1024.0f * 1024.0f), (total_source_pixels / (1024.0f * 1024.0f)) / stbi_best_time);
 	
 	if (!csv_flag)
 	{
@@ -1189,29 +1545,75 @@ int main(int arg_c, char **arg_v)
 		
 	// Validate QOI's output file
 	{
+		// Best-of-N timing: start from a large sentinel and keep the fastest decode.
+		qoi_decode_time = 1e+9f;
 		qoi_desc qddesc;
-		tm.start();
-		void* pQOI_decomp_data = qoi_decode(pQOI_data, qoi_len, &qddesc, 4);
-		qoi_decode_time = tm.get_elapsed_secs();
-				
-		if (memcmp(pQOI_decomp_data, pSource_pixels32, total_source_pixels * 4) != 0)
+		for (uint32_t i = 0; i < NUM_TIMES_TO_DECODE; i++) // decode loop: use the decode iteration count, not NUM_TIMES_TO_ENCODE
 		{
-			fprintf(stderr, "QOI verification failure!\n");
-			return EXIT_FAILURE;
+			tm.start();
+			void* pQOI_decomp_data = qoi_decode(pQOI_data, qoi_len, &qddesc, 4);
+
+			qoi_decode_time = minimum(qoi_decode_time, tm.get_elapsed_secs());
+
+			if (!pQOI_decomp_data || (memcmp(pQOI_decomp_data, pSource_pixels32, total_source_pixels * 4) != 0)) // a failed decode must not reach memcmp()
+			{
+				fprintf(stderr, "QOI verification failure!\n");
+				return EXIT_FAILURE;
+			}
+
+			free(pQOI_decomp_data);
 		}
-		free(pQOI_decomp_data);
 	}

 	free(pQOI_data);
 	pQOI_data = nullptr;
+		
+	{
+		// Decode the PNG file using pvpng, which ships with BasisU and uses miniz for decompression.
+
+		pvpng_decode_time = 1e+9f;
+
+		for (uint32_t i = 0; i < NUM_TIMES_TO_DECODE; i++) // decode loop: use the decode iteration count, not NUM_TIMES_TO_ENCODE
+		{
+			uint32_t width = 0, height = 0, num_chans = 0;
+
+			tm.start();
+
+			void* pImage_data = pv_png::load_png(fpng_file_buf.data(), fpng_file_buf.size(), source_chans, width, height, num_chans);
+
+			pvpng_decode_time = minimum(pvpng_decode_time, tm.get_elapsed_secs());
+
+			if (!pImage_data)
+			{
+				fprintf(stderr, "Failed decoding using pvpng! (1)\n");
+				return EXIT_FAILURE;
+			}
+
+			if ((num_chans != source_chans) || (width != source_width) || (height != source_height))
+			{
+				fprintf(stderr, "Failed decoding using pvpng! (2)\n");
+				return EXIT_FAILURE;
+			}
+
+			if (memcmp((source_chans == 3) ? (const void*)pSource_pixels24 : (const void*)pSource_pixels32, pImage_data, width * height * source_chans) != 0)
+			{
+				fprintf(stderr, "Failed decoding using pvpng! (3)\n");
+				return EXIT_FAILURE;
+			}
+
+			free(pImage_data);
+		}
+	}

 	if (!csv_flag)
 	{
 		printf("** Decoding:\n");
-		printf("FPNG:    %3.6f secs, %4.3f MP/s\n", fpng_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / fpng_decode_time);
-		printf("lodepng: %3.6f secs, %4.3f MP/s\n", lodepng_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / lodepng_decode_time);
-		printf("stbi:    %3.6f secs, %4.3f MP/s\n", stbi_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / stbi_decode_time);
-		printf("qoi:     %3.6f secs, %4.3f MP/s\n", qoi_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / qoi_decode_time);
+		printf("FPNG:    %3.6f secs, %4.3f MP/sec\n", fpng_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / fpng_decode_time);
+		printf("lodepng: %3.6f secs, %4.3f MP/sec\n", lodepng_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / lodepng_decode_time);
+		printf("stbi:    %3.6f secs, %4.3f MP/sec\n", stbi_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / stbi_decode_time);
+		printf("wuffs:   %3.6f secs, %4.3f MP/sec\n", wuffs_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / wuffs_decode_time);
+		printf("pvpng:   %3.6f secs, %4.3f MP/sec\n", pvpng_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / pvpng_decode_time);
+		printf("qoi:     %3.6f secs, %4.3f MP/sec\n", qoi_decode_time, (total_source_pixels / (1024.0f * 1024.0f)) / qoi_decode_time);
 	}

 	if (csv_flag)
@@ -1220,12 +1622,13 @@ int main(int arg_c, char **arg_v)

 		const double source_megapixels = total_source_pixels / (1024.0f * 1024.0f);

-		printf("%s, %u, %u, %u,    %f, %f, %f, %4.1f, %4.1f,    %f, %f, %f, %4.1f, %4.1f,    %f, %f, %f, %4.1f, %4.1f,    %f, %f, %f, %4.1f, %4.1f\n",
+		printf("%s, %u, %u, %u,    %f, %f, %f, %4.3f, %4.3f,    %f, %f, %f, %4.3f, %4.3f,    %f, %f, %f, %4.3f, %4.3f,    %f, %f, %f, %4.3f, %4.3f,   %4.3f, %4.3f\n",
 			pFilename, source_width, source_height, source_chans,
 			qoi_best_time, (double)qoi_len / MB, qoi_decode_time, source_megapixels / qoi_best_time, source_megapixels / qoi_decode_time,
 			fpng_best_time, (double)fpng_file_buf.size() / MB, fpng_decode_time, source_megapixels / fpng_best_time, source_megapixels / fpng_decode_time,
 			lodepng_best_time, (double)lodepng_file_buf.size() / MB, lodepng_decode_time, source_megapixels / lodepng_best_time, source_megapixels / lodepng_decode_time,
-			stbi_best_time, (double)stbi_file_buf.size() / MB, stbi_decode_time, source_megapixels / stbi_best_time, source_megapixels / stbi_decode_time
+			stbi_best_time, (double)stbi_file_buf.size() / MB, stbi_decode_time, source_megapixels / stbi_best_time, source_megapixels / stbi_decode_time,
+			pvpng_decode_time, source_megapixels / pvpng_decode_time
 			);
 	}

--- a/src/lodepng.cpp
+++ b/src/lodepng.cpp
--- a/src/lodepng.h
+++ b/src/lodepng.h
@@ -1,7 +1,7 @@
 /*
-LodePNG version 20210627
+LodePNG version 20230410

-Copyright (c) 2005-2021 Lode Vandevenne
+Copyright (c) 2005-2023 Lode Vandevenne

 This software is provided 'as-is', without any express or implied
 warranty. In no event will the authors be held liable for any damages
@@ -35,43 +35,50 @@ The following #defines are used to create code sections. They can be disabled
 to disable code sections, which can give faster compile time and smaller binary.
 The "NO_COMPILE" defines are designed to be used to pass as defines to the
 compiler command to disable them without modifying this header, e.g.
-DLODEPNG_NO_COMPILE_ZLIB for gcc.
-In addition to those below, you can also define LODEPNG_NO_COMPILE_CRC to
-allow implementing a custom lodepng_crc32.
+-DLODEPNG_NO_COMPILE_ZLIB for gcc or clang.
 */
 /*deflate & zlib. If disabled, you must specify alternative zlib functions in
 the custom_zlib field of the compress and decompress settings*/
 #ifndef LODEPNG_NO_COMPILE_ZLIB
+/*pass -DLODEPNG_NO_COMPILE_ZLIB to the compiler to disable this, or comment out LODEPNG_COMPILE_ZLIB below*/
 #define LODEPNG_COMPILE_ZLIB
 #endif

 /*png encoder and png decoder*/
 #ifndef LODEPNG_NO_COMPILE_PNG
+/*pass -DLODEPNG_NO_COMPILE_PNG to the compiler to disable this, or comment out LODEPNG_COMPILE_PNG below*/
 #define LODEPNG_COMPILE_PNG
 #endif

 /*deflate&zlib decoder and png decoder*/
 #ifndef LODEPNG_NO_COMPILE_DECODER
+/*pass -DLODEPNG_NO_COMPILE_DECODER to the compiler to disable this, or comment out LODEPNG_COMPILE_DECODER below*/
 #define LODEPNG_COMPILE_DECODER
 #endif

 /*deflate&zlib encoder and png encoder*/
 #ifndef LODEPNG_NO_COMPILE_ENCODER
+/*pass -DLODEPNG_NO_COMPILE_ENCODER to the compiler to disable this, or comment out LODEPNG_COMPILE_ENCODER below*/
 #define LODEPNG_COMPILE_ENCODER
 #endif

 /*the optional built in harddisk file loading and saving functions*/
 #ifndef LODEPNG_NO_COMPILE_DISK
+/*pass -DLODEPNG_NO_COMPILE_DISK to the compiler to disable this, or comment out LODEPNG_COMPILE_DISK below*/
 #define LODEPNG_COMPILE_DISK
 #endif

 /*support for chunks other than IHDR, IDAT, PLTE, tRNS, IEND: ancillary and unknown chunks*/
 #ifndef LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
+/*pass -DLODEPNG_NO_COMPILE_ANCILLARY_CHUNKS to the compiler to disable this,
+or comment out LODEPNG_COMPILE_ANCILLARY_CHUNKS below*/
 #define LODEPNG_COMPILE_ANCILLARY_CHUNKS
 #endif

 /*ability to convert error numerical codes to English text string*/
 #ifndef LODEPNG_NO_COMPILE_ERROR_TEXT
+/*pass -DLODEPNG_NO_COMPILE_ERROR_TEXT to the compiler to disable this,
+or comment out LODEPNG_COMPILE_ERROR_TEXT below*/
 #define LODEPNG_COMPILE_ERROR_TEXT
 #endif

@@ -79,12 +86,27 @@ the custom_zlib field of the compress and decompress settings*/
 you can define the functions lodepng_free, lodepng_malloc and lodepng_realloc in your
 source files with custom allocators.*/
 #ifndef LODEPNG_NO_COMPILE_ALLOCATORS
+/*pass -DLODEPNG_NO_COMPILE_ALLOCATORS to the compiler to disable the built-in ones,
+or comment out LODEPNG_COMPILE_ALLOCATORS below*/
 #define LODEPNG_COMPILE_ALLOCATORS
 #endif

+/*Disable built-in CRC function, in that case a custom implementation of
+lodepng_crc32 must be defined externally so that it can be linked in.
+The default built-in CRC code comes with 8KB of lookup tables, so for memory constrained environment you may want it
+disabled and provide a much smaller implementation externally as said above. You can find such an example implementation
+in a comment in the lodepng.c(pp) file in the 'else' case of the searchable LODEPNG_COMPILE_CRC section.*/
+#ifndef LODEPNG_NO_COMPILE_CRC
+/*pass -DLODEPNG_NO_COMPILE_CRC to the compiler to disable the built-in one,
+or comment out LODEPNG_COMPILE_CRC below*/
+#define LODEPNG_COMPILE_CRC
+#endif
+
 /*compile the C++ version (you can disable the C++ wrapper here even when compiling for C++)*/
 #ifdef __cplusplus
 #ifndef LODEPNG_NO_COMPILE_CPP
+/*pass -DLODEPNG_NO_COMPILE_CPP to the compiler to disable C++ (not needed if a C-only compiler),
+or comment out LODEPNG_COMPILE_CPP below*/
 #define LODEPNG_COMPILE_CPP
 #endif
 #endif
@@ -374,8 +396,10 @@ typedef struct LodePNGColorMode {

  The alpha channels must be set as well, set them to 255 for opaque images.

-  When decoding, by default you can ignore this palette, since LodePNG already
-  fills the palette colors in the pixels of the raw RGBA output.
+  When decoding, with the default settings you can ignore this palette, since
+  LodePNG already fills the palette colors in the pixels of the raw RGBA output,
+  but when decoding to the original PNG color mode it is needed to reconstruct
+  the colors.

  The palette is only supported for color type 3.
  */
@@ -465,10 +489,12 @@ typedef struct LodePNGInfo {
  with values truncated to the bit depth in the unsigned integer.

  For grayscale and palette PNGs, the value is stored in background_r. The values
-  in background_g and background_b are then unused.
+  in background_g and background_b are then unused. The decoder will set them
+  equal to background_r, the encoder ignores them in this case.

-  So when decoding, you may get these in a different color mode than the one you requested
-  for the raw pixels.
+  When decoding, you may get these in a different color mode than the one you requested
+  for the raw pixels: the colortype and bitdepth defined by info_png.color, that is the
+  ones defined in the header of the PNG image, are used.

  When encoding with auto_convert, you must use the color model defined in info_png.color for
  these values. The encoder normally ignores info_png.color when auto_convert is on, but will
@@ -535,7 +561,7 @@ typedef struct LodePNGInfo {
  unsigned phys_unit; /*may be 0 (unknown unit) or 1 (metre)*/

  /*
-  Color profile related chunks: gAMA, cHRM, sRGB, iCPP
+  Color profile related chunks: gAMA, cHRM, sRGB, iCCP, sBIT

  LodePNG does not apply any color conversions on pixels in the encoder or decoder and does not interpret these color
  profile values. It merely passes on the information. If you wish to use color profiles and convert colors, please
@@ -598,6 +624,45 @@ typedef struct LodePNGInfo {
  unsigned char* iccp_profile;
  unsigned iccp_profile_size; /* The size of iccp_profile in bytes */

+  /*
+  sBIT chunk: significant bits. Optional metadata, only set this if needed.
+
+  If defined, these values give the bit depth of the original data. Since PNG only stores 1, 2, 4, 8 or 16-bit
+  per channel data, the significant bits value can be used to indicate the original encoded data has another
+  sample depth, such as 10 or 12.
+
+  Encoders using this value, when storing the pixel data, should use the most significant bits
+  of the data to store the original bits, and use a good sample depth scaling method such as
+  "left bit replication" to fill in the least significant bits, rather than fill zeroes.
+
+  Decoders using this value, if able to work with data that's e.g. 10-bit or 12-bit, should right
+  shift the data to go back to the original bit depth, but decoders are also allowed to ignore
+  sbit and work e.g. with the 8-bit or 16-bit data from the PNG directly, since thanks
+  to the encoder contract, the values encoded in PNG are in valid range for the PNG bit depth.
+
+  For grayscale images, sbit_g and sbit_b are not used, and for images that don't use color
+  type RGBA or grayscale+alpha, sbit_a is not used (it's not used even for palette images with
+  translucent palette values, or images with color key). The values that are used must be
+  greater than zero and smaller than or equal to the PNG bit depth.
+
+  The color type from the header in the PNG image defines these used and unused fields: if
+  decoding with a color mode conversion, such as always decoding to RGBA, this metadata still
+  only uses the color type of the original PNG, and may e.g. lack the alpha channel info
+  if the PNG was RGB. When encoding, with or without auto_convert, the color model defined
+  in info_png.color likewise always determines this.
+
+  NOTE: enabling sbit can hurt compression, because the encoder can then not always use
+  auto_convert to choose a more optimal color mode for the data, because the PNG format has
+  strict requirements for the allowed sbit values in combination with color modes.
+  For example, setting these fields to 10-bit will force the encoder to keep using a 16-bit per channel
+  color mode, even if the pixel data would in fact fit in a more efficient 8-bit mode.
+  */
+  unsigned sbit_defined; /*is significant bits given? if not, the values below are unused*/
+  unsigned sbit_r;       /*red or gray component of significant bits*/
+  unsigned sbit_g;       /*green component of significant bits*/
+  unsigned sbit_b;       /*blue component of significant bits*/
+  unsigned sbit_a;       /*alpha component of significant bits*/
+
  /* End of color profile related chunks */


@@ -770,7 +835,11 @@ typedef struct LodePNGEncoderSettings {
  const unsigned char* predefined_filters;

  /*force creating a PLTE chunk if colortype is 2 or 6 (= a suggested palette).
-  If colortype is 3, PLTE is _always_ created.*/
+  If colortype is 3, PLTE is always created. If color type is explicitly set
+  to a grayscale type (0 or 4), this is not done and is ignored. If enabling this,
+  a palette must be present in the info_png.
+  NOTE: enabling this may worsen compression if auto_convert is used to choose
+  optimal color mode, because it cannot use grayscale color modes in this case*/
  unsigned force_palette;
 #ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
  /*add LodePNG identifier and version as a text chunk, for debugging*/
@@ -824,8 +893,8 @@ unsigned lodepng_inspect(unsigned* w, unsigned* h,
 #endif /*LODEPNG_COMPILE_DECODER*/

 /*
-Reads one metadata chunk (other than IHDR) of the PNG file and outputs what it
-read in the state. Returns error code on failure.
+Reads one metadata chunk (other than IHDR, which is handled by lodepng_inspect)
+of the PNG file and outputs what it read in the state. Returns error code on failure.
 Use lodepng_inspect first with a new state, then e.g. lodepng_chunk_find_const
 to find the desired chunk type, and if non null use lodepng_inspect_chunk (with
 chunk_pointer - start_of_file as pos).
@@ -1103,7 +1172,7 @@ TODO:
 [.] check compatibility with various compilers  - done but needs to be redone for every newer version
 [X] converting color to 16-bit per channel types
 [X] support color profile chunk types (but never let them touch RGB values by default)
-[ ] support all public PNG chunk types (almost done except sBIT, sPLT and hIST)
+[ ] support all public PNG chunk types (almost done except sPLT and hIST)
 [ ] make sure encoder generates no chunks with size > (2^31)-1
 [ ] partial decoding (stream processing)
 [X] let the "isFullyOpaque" function check color keys and transparent palettes too
@@ -1230,18 +1299,16 @@ The following features are supported by the decoder:
    gAMA: RGB gamma correction
    iCCP: ICC color profile
    sRGB: rendering intent
+    sBIT: significant bits

 1.2. features not supported
 ---------------------------

-The following features are _not_ supported:
+The following features are not (yet) supported:

 *) some features needed to make a conformant PNG-Editor might be still missing.
 *) partial loading/stream processing. All data must be available and is processed in one call.
-*) The following public chunks are not (yet) supported but treated as unknown chunks by LodePNG:
-    sBIT
-    hIST
-    sPLT
+*) The hIST and sPLT public chunks are not (yet) supported but treated as unknown chunks


 2. C and C++ version
@@ -1845,6 +1912,9 @@ symbol.
 Not all changes are listed here, the commit history in github lists more:
 https://github.com/lvandeve/lodepng

+*) 10 apr 2023: faster CRC32 implementation, but with larger lookup table.
+*) 13 jun 2022: added support for the sBIT chunk.
+*) 09 jan 2022: minor decoder speed improvements.
 *) 27 jun 2021: added warnings that file reading/writing functions don't support
   wide-character filenames (support for this is not planned, opening files is
   not the core part of PNG decoding/decoding and is platform dependent).
@@ -2015,5 +2085,5 @@ Domain: gmail dot com.
 Account: lode dot vandevenne.


-Copyright (c) 2005-2021 Lode Vandevenne
+Copyright (c) 2005-2022 Lode Vandevenne
 */
--- a/src/pvpngreader.cpp
+++ b/src/pvpngreader.cpp
--- a/src/pvpngreader.h
+++ b/src/pvpngreader.h
@@ -0,0 +1,48 @@
+// pvpngreader.h - Public Domain - see unlicense at bottom of pvpngreader.cpp
+#pragma once
+#include <stdint.h>
+
+namespace pv_png
+{
+	// PNG color types (values stored in png_info::m_color_type)
+	enum
+	{
+		PNG_COLOR_TYPE_GREYSCALE = 0,
+		PNG_COLOR_TYPE_TRUECOLOR = 2,
+		PNG_COLOR_TYPE_PALETTIZED = 3,
+		PNG_COLOR_TYPE_GREYSCALE_ALPHA = 4,
+		PNG_COLOR_TYPE_TRUECOLOR_ALPHA = 6
+	};
+
+	// Description of a PNG file, filled in by get_png_info()
+	struct png_info
+	{
+		uint32_t m_width;
+		uint32_t m_height;
+				
+		uint32_t m_num_chans;	// The number of channels, factoring in transparency. Ranges from [1-4].
+
+		uint32_t m_bit_depth;	// PNG IHDR bit depth: 1, 2, 4, 8 or 16
+		uint32_t m_color_type;	// PNG IHDR color type, PNG_COLOR_TYPE_GREYSCALE etc.
+
+		bool m_has_gamma;		// true if the PNG file had a gAMA chunk
+		uint32_t m_gamma_value; // PNG gAMA chunk value, scaled by 100000
+
+		bool m_has_trns;		// true if the PNG file used colorkey transparency
+	};
+
+	// Retrieves information about the PNG file held in pImage_buf (buf_size bytes) into info.
+	// Returns false on any errors.
+	bool get_png_info(const void* pImage_buf, size_t buf_size, png_info& info);
+
+	// Input parameters:
+	// pImage_buf, buf_size - pointer to PNG image data and its size
+	// desired_chans - desired number of output channels. 0=auto, 1=grayscale, 2=grayscale alpha, 3=24bpp RGB, 4=32bpp RGBA
+	//
+	// Output parameters:
+	// width, height - PNG image resolution
+	// num_chans - actual number of channels in PNG, from [1,4] (factoring in transparency)
+	//
+	// Returns nullptr on any errors.
+	void* load_png(const void* pImage_buf, size_t buf_size, uint32_t desired_chans, uint32_t &width, uint32_t &height, uint32_t& num_chans);
+}
--- a/src/qoi.h
+++ b/src/qoi.h
@@ -1,39 +1,16 @@
 /*

+Copyright (c) 2021, Dominic Szablewski - https://phoboslab.org
+SPDX-License-Identifier: MIT
+
+
 QOI - The "Quite OK Image" format for fast, lossless image compression

-Dominic Szablewski - https://phoboslab.org
-
-
-- LICENSE: The MIT License(MIT)
-
-Copyright(c) 2021 Dominic Szablewski
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files(the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions :
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-
 -- About

-QOI encodes and decodes images in a lossless format. An encoded QOI image is
-usually around 10--30% larger than a decently optimized PNG image.
-
-QOI outperforms simpler PNG encoders in compression ratio and performance. QOI
-images are typically 20% smaller than PNGs written with stbi_image. Encoding is 
-25-50x faster and decoding is 3-4x faster than stbi_image or libpng.
+QOI encodes and decodes images in a lossless format. Compared to stb_image and
+stb_image_write QOI offers 20x-50x faster encoding, 3x-4x faster decoding and
+20% better compression.


 -- Synopsis
@@ -48,7 +25,7 @@ images are typically 20% smaller than PNGs written with stbi_image. Encoding is
 // the input pixel data.
 qoi_write("image_new.qoi", rgba_pixels, &(qoi_desc){
 	.width = 1920,
-	.height = 1080, 
+	.height = 1080,
 	.channels = 4,
 	.colorspace = QOI_SRGB
 });
@@ -77,14 +54,14 @@ QOI_NO_STDIO before including this library.
 This library uses malloc() and free(). To supply your own malloc implementation
 you can define QOI_MALLOC and QOI_FREE before including this library.

-This library uses memset() to zero-initialize the index. To supply your own 
+This library uses memset() to zero-initialize the index. To supply your own
 implementation you can define QOI_ZEROARR before including this library.


 -- Data Format

-A QOI file has a 14 byte header, followed by any number of data "chunks" and 8
-zero-bytes to mark the end of the data stream.
+A QOI file has a 14 byte header, followed by any number of data "chunks" and an
+8-byte end marker.

 struct qoi_header_t {
 	char     magic[4];   // magic bytes "qoif"
@@ -94,33 +71,36 @@ struct qoi_header_t {
 	uint8_t  colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear
 };

-The decoder and encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous
-pixel value. Pixels are either encoded as
+Images are encoded row by row, left to right, top to bottom. The decoder and
+encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous pixel value. An
+image is complete when all pixels specified by width * height have been covered.
+
+Pixels are encoded as
 - a run of the previous pixel
 - an index into an array of previously seen pixels
 - a difference to the previous pixel value in r,g,b
 - full r,g,b or r,g,b,a values

-The color channels are assumed to not be premultiplied with the alpha channel 
+The color channels are assumed to not be premultiplied with the alpha channel
 ("un-premultiplied alpha").

-A running array[64] (zero-initialized) of previously seen pixel values is 
+A running array[64] (zero-initialized) of previously seen pixel values is
 maintained by the encoder and decoder. Each pixel that is seen by the encoder
 and decoder is put into this array at the position formed by a hash function of
 the color value. In the encoder, if the pixel value at the index matches the
-current pixel, this index position is written to the stream as QOI_OP_INDEX. 
+current pixel, this index position is written to the stream as QOI_OP_INDEX.
 The hash function for the index is:

 	index_position = (r * 3 + g * 5 + b * 7 + a * 11) % 64

-Each chunk starts with a 2- or 8-bit tag, followed by a number of data bits. The 
-bit length of chunks is divisible by 8 - i.e. all chunks are byte aligned. All 
+Each chunk starts with a 2- or 8-bit tag, followed by a number of data bits. The
+bit length of chunks is divisible by 8 - i.e. all chunks are byte aligned. All
 values encoded in these data bits have the most significant bit on the left.

 The 8-bit tags have precedence over the 2-bit tags. A decoder must check for the
 presence of an 8-bit tag first.

-The byte stream is padded with 8 zero-bytes at the end.
+The byte stream's end is marked with 7 0x00 bytes followed by a single 0x01 byte.


 The possible chunks are:
@@ -135,8 +115,11 @@ The possible chunks are:
 2-bit tag b00
 6-bit index into the color index array: 0..63

+A valid encoder must not issue 2 or more consecutive QOI_OP_INDEX chunks to the
+same index. QOI_OP_RUN should be used instead.

-.- QOI_OP_DIFF -----------. 
+
+.- QOI_OP_DIFF -----------.
 |         Byte[0]         |
 |  7  6  5  4  3  2  1  0 |
 |-------+-----+-----+-----|
@@ -147,14 +130,16 @@ The possible chunks are:
 2-bit green channel difference from the previous pixel between -2..1
 2-bit  blue channel difference from the previous pixel between -2..1

-The difference to the current channel values are using a wraparound operation, 
+The difference to the current channel values are using a wraparound operation,
 so "1 - 2" will result in 255, while "255 + 1" will result in 0.

-Values are stored as unsigned integers with a bias of 2. E.g. -2 is stored as 
+Values are stored as unsigned integers with a bias of 2. E.g. -2 is stored as
 0 (b00). 1 is stored as 3 (b11).

+The alpha value remains unchanged from the previous pixel.

-.- QOI_OP_LUMA -------------------------------------. 
+
+.- QOI_OP_LUMA -------------------------------------.
 |         Byte[0]         |         Byte[1]         |
 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
 |-------+-----------------+-------------+-----------|
@@ -165,18 +150,20 @@ Values are stored as unsigned integers with a bias of 2. E.g. -2 is stored as
 4-bit   red channel difference minus green channel difference -8..7
 4-bit  blue channel difference minus green channel difference -8..7

-The green channel is used to indicate the general direction of change and is 
-encoded in 6 bits. The red and green channels (dr and db) base their diffs off
+The green channel is used to indicate the general direction of change and is
+encoded in 6 bits. The red and blue channels (dr and db) base their diffs off
 of the green channel difference and are encoded in 4 bits. I.e.:
-  dr_dg = (last_px.r - cur_px.r) - (last_px.g - cur_px.g)
-  db_dg = (last_px.b - cur_px.b) - (last_px.g - cur_px.g)
+	dr_dg = (cur_px.r - prev_px.r) - (cur_px.g - prev_px.g)
+	db_dg = (cur_px.b - prev_px.b) - (cur_px.g - prev_px.g)

-The difference to the current channel values are using a wraparound operation, 
+The difference to the current channel values are using a wraparound operation,
 so "10 - 13" will result in 253, while "250 + 7" will result in 1.

-Values are stored as unsigned integers with a bias of 32 for the green channel 
+Values are stored as unsigned integers with a bias of 32 for the green channel
 and a bias of 8 for the red and blue channel.

+The alpha value remains unchanged from the previous pixel.
+

 .- QOI_OP_RUN ------------.
 |         Byte[0]         |
@@ -187,8 +174,8 @@ and a bias of 8 for the red and blue channel.
 2-bit tag b11
 6-bit run-length repeating the previous pixel: 1..62

-The run-length is stored with a bias of 1. Note that the run-lengths 63 and 64 
-(b111110 and b111111) are illegal as they are occupied by the QOI_OP_RGB and 
+The run-length is stored with a bias of -1. Note that the run-lengths 63 and 64
+(b111110 and b111111) are illegal as they are occupied by the QOI_OP_RGB and
 QOI_OP_RGBA tags.


@@ -203,6 +190,8 @@ QOI_OP_RGBA tags.
 8-bit green channel value
 8-bit  blue channel value

+The alpha value remains unchanged from the previous pixel.
+

 .- QOI_OP_RGBA ---------------------------------------------------.
 |         Byte[0]         | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
@@ -216,13 +205,6 @@ QOI_OP_RGBA tags.
 8-bit  blue channel value
 8-bit alpha channel value

-
-The byte stream is padded at the end with 8 zero bytes. Since the longest legal 
-chunk is 5 bytes (QOI_OP_RGBA), with this padding it is possible to check for an
-overrun only once per decode loop iteration. These 0x00 bytes also mark the end
-of the data stream, as an encoder should never produce 8 consecutive zero bytes
-within the stream.
-
 */


@@ -236,17 +218,17 @@ Header - Public functions */
 extern "C" {
 #endif

-/* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions. 
-It describes either the input format (for qoi_write and qoi_encode), or is 
+/* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions.
+It describes either the input format (for qoi_write and qoi_encode), or is
 filled with the description read from the file header (for qoi_read and
 qoi_decode).

-The colorspace in this qoi_desc is an enum where 
+The colorspace in this qoi_desc is an enum where
 	0 = sRGB, i.e. gamma scaled RGB channels and a linear alpha channel
 	1 = all channels are linear
-You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely 
+You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely
 informative. It will be saved to the file header, but does not affect
-en-/decoding in any way. */
+how chunks are en-/decoded. */

 #define QOI_SRGB   0
 #define QOI_LINEAR 1
@@ -260,11 +242,11 @@ typedef struct {

 #ifndef QOI_NO_STDIO

-/* Encode raw RGB or RGBA pixels into a QOI image and write it to the file 
-system. The qoi_desc struct must be filled with the image width, height, 
-number of channels (3 = RGB, 4 = RGBA) and the colorspace. 
+/* Encode raw RGB or RGBA pixels into a QOI image and write it to the file
+system. The qoi_desc struct must be filled with the image width, height,
+number of channels (3 = RGB, 4 = RGBA) and the colorspace.

-The function returns 0 on failure (invalid parameters, or fopen or malloc 
+The function returns 0 on failure (invalid parameters, or fopen or malloc
 failed) or the number of bytes written on success. */

 int qoi_write(const char *filename, const void *data, const qoi_desc *desc);
@@ -275,7 +257,7 @@ number of channels from the file header is used. If channels is 3 or 4 the
 output format will be forced into this number of channels.

 The function either returns NULL on failure (invalid data, or malloc or fopen
-failed) or a pointer to the decoded pixels. On success, the qoi_desc struct 
+failed) or a pointer to the decoded pixels. On success, the qoi_desc struct
 will be filled with the description from the file header.

 The returned pixel data should be free()d after use. */
@@ -287,8 +269,8 @@ void *qoi_read(const char *filename, qoi_desc *desc, int channels);

 /* Encode raw RGB or RGBA pixels into a QOI image in memory.

-The function either returns NULL on failure (invalid parameters or malloc 
-failed) or a pointer to the encoded data on success. On success the out_len 
+The function either returns NULL on failure (invalid parameters or malloc
+failed) or a pointer to the encoded data on success. On success the out_len
 is set to the size in bytes of the encoded data.

 The returned qoi data should be free()d after use. */
@@ -298,8 +280,8 @@ void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len);

 /* Decode a QOI image from memory.

-The function either returns NULL on failure (invalid parameters or malloc 
-failed) or a pointer to the decoded pixels. On success, the qoi_desc struct 
+The function either returns NULL on failure (invalid parameters or malloc
+failed) or a pointer to the decoded pixels. On success, the qoi_desc struct
 is filled with the description from the file header.

 The returned pixel data should be free()d after use. */
@@ -342,21 +324,28 @@ Implementation */
 	(((unsigned int)'q') << 24 | ((unsigned int)'o') << 16 | \
 	 ((unsigned int)'i') <<  8 | ((unsigned int)'f'))
 #define QOI_HEADER_SIZE 14
-#define QOI_PADDING 8
+
+/* 2GB is the max file size that this implementation can safely handle. We guard
+against anything larger than that, assuming the worst case with 5 bytes per
+pixel, rounded down to a nice clean value. 400 million pixels ought to be
+enough for anybody. */
+#define QOI_PIXELS_MAX ((unsigned int)400000000)

 typedef union {
 	struct { unsigned char r, g, b, a; } rgba;
 	unsigned int v;
 } qoi_rgba_t;

-void qoi_write_32(unsigned char *bytes, int *p, unsigned int v) {
+static const unsigned char qoi_padding[8] = {0,0,0,0,0,0,0,1};
+
+static void qoi_write_32(unsigned char *bytes, int *p, unsigned int v) {
 	bytes[(*p)++] = (0xff000000 & v) >> 24;
 	bytes[(*p)++] = (0x00ff0000 & v) >> 16;
 	bytes[(*p)++] = (0x0000ff00 & v) >> 8;
 	bytes[(*p)++] = (0x000000ff & v);
 }

-unsigned int qoi_read_32(const unsigned char *bytes, int *p) {
+static unsigned int qoi_read_32(const unsigned char *bytes, int *p) {
 	unsigned int a = bytes[(*p)++];
 	unsigned int b = bytes[(*p)++];
 	unsigned int c = bytes[(*p)++];
@@ -376,14 +365,15 @@ void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len) {
 		data == NULL || out_len == NULL || desc == NULL ||
 		desc->width == 0 || desc->height == 0 ||
 		desc->channels < 3 || desc->channels > 4 ||
-		desc->colorspace > 2
+		desc->colorspace > 1 ||
+		desc->height >= QOI_PIXELS_MAX / desc->width
 	) {
 		return NULL;
 	}

-	max_size = 
-		desc->width * desc->height * (desc->channels + 1) + 
-		QOI_HEADER_SIZE + QOI_PADDING;
+	max_size =
+		desc->width * desc->height * (desc->channels + 1) +
+		QOI_HEADER_SIZE + sizeof(qoi_padding);

 	p = 0;
 	bytes = (unsigned char *) QOI_MALLOC(max_size);
@@ -408,19 +398,18 @@ void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len) {
 	px_prev.rgba.b = 0;
 	px_prev.rgba.a = 255;
 	px = px_prev;
-	
+
 	px_len = desc->width * desc->height * desc->channels;
 	px_end = px_len - desc->channels;
 	channels = desc->channels;

 	for (px_pos = 0; px_pos < px_len; px_pos += channels) {
+		px.rgba.r = pixels[px_pos + 0];
+		px.rgba.g = pixels[px_pos + 1];
+		px.rgba.b = pixels[px_pos + 2];
+
 		if (channels == 4) {
-			px = *(qoi_rgba_t *)(pixels + px_pos);
-		}
-		else {
-			px.rgba.r = pixels[px_pos + 0];
-			px.rgba.g = pixels[px_pos + 1];
-			px.rgba.b = pixels[px_pos + 2];
+			px.rgba.a = pixels[px_pos + 3];
 		}

 		if (px.v == px_prev.v) {
@@ -456,14 +445,14 @@ void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len) {

 					if (
 						vr > -3 && vr < 2 &&
-						vg > -3 && vg < 2 && 
+						vg > -3 && vg < 2 &&
 						vb > -3 && vb < 2
 					) {
 						bytes[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
 					}
 					else if (
-						vg_r >  -9 && vg_r <  8 && 
-						vg   > -33 && vg   < 32 && 
+						vg_r >  -9 && vg_r <  8 &&
+						vg   > -33 && vg   < 32 &&
 						vg_b >  -9 && vg_b <  8
 					) {
 						bytes[p++] = QOI_OP_LUMA     | (vg   + 32);
@@ -488,8 +477,8 @@ void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len) {
 		px_prev = px;
 	}

-	for (i = 0; i < QOI_PADDING; i++) {
-		bytes[p++] = 0;
+	for (i = 0; i < (int)sizeof(qoi_padding); i++) {
+		bytes[p++] = qoi_padding[i];
 	}

 	*out_len = p;
@@ -502,13 +491,13 @@ void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels) {
 	unsigned char *pixels;
 	qoi_rgba_t index[64];
 	qoi_rgba_t px;
-	int px_len,  chunks_len, px_pos;
+	int px_len, chunks_len, px_pos;
 	int p = 0, run = 0;

 	if (
 		data == NULL || desc == NULL ||
 		(channels != 0 && channels != 3 && channels != 4) ||
-		size < QOI_HEADER_SIZE + QOI_PADDING
+		size < QOI_HEADER_SIZE + (int)sizeof(qoi_padding)
 	) {
 		return NULL;
 	}
@@ -522,10 +511,11 @@ void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels) {
 	desc->colorspace = bytes[p++];

 	if (
-		desc->width == 0 || desc->height == 0 || 
+		desc->width == 0 || desc->height == 0 ||
 		desc->channels < 3 || desc->channels > 4 ||
-		desc->colorspace > 2 ||
-		header_magic != QOI_MAGIC
+		desc->colorspace > 1 ||
+		header_magic != QOI_MAGIC ||
+		desc->height >= QOI_PIXELS_MAX / desc->width
 	) {
 		return NULL;
 	}
@@ -546,7 +536,7 @@ void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels) {
 	px.rgba.b = 0;
 	px.rgba.a = 255;

-	chunks_len = size - QOI_PADDING;
+	chunks_len = size - (int)sizeof(qoi_padding);
 	for (px_pos = 0; px_pos < px_len; px_pos += channels) {
 		if (run > 0) {
 			run--;
@@ -587,13 +577,12 @@ void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels) {
 			index[QOI_COLOR_HASH(px) % 64] = px;
 		}

-		if (channels == 4) { 
-			*(qoi_rgba_t*)(pixels + px_pos) = px;
-		}
-		else {
-			pixels[px_pos + 0] = px.rgba.r;
-			pixels[px_pos + 1] = px.rgba.g;
-			pixels[px_pos + 2] = px.rgba.b;
+		pixels[px_pos + 0] = px.rgba.r;
+		pixels[px_pos + 1] = px.rgba.g;
+		pixels[px_pos + 2] = px.rgba.b;
+		
+		if (channels == 4) {
+			pixels[px_pos + 3] = px.rgba.a;
 		}
 	}

@@ -616,11 +605,11 @@ int qoi_write(const char *filename, const void *data, const qoi_desc *desc) {
 	if (!encoded) {
 		fclose(f);
 		return 0;
-	}	
-	
+	}
+
 	fwrite(encoded, 1, size, f);
 	fclose(f);
-	
+
 	QOI_FREE(encoded);
 	return size;
 }
@@ -636,6 +625,10 @@ void *qoi_read(const char *filename, qoi_desc *desc, int channels) {

 	fseek(f, 0, SEEK_END);
 	size = ftell(f);
+	if (size <= 0) {
+		fclose(f);
+		return NULL;
+	}
 	fseek(f, 0, SEEK_SET);

 	data = QOI_MALLOC(size);
@@ -653,4 +646,4 @@ void *qoi_read(const char *filename, qoi_desc *desc, int channels) {
 }

 #endif /* QOI_NO_STDIO */
-#endif /* QOI_IMPLEMENTATION */
+#endif /* QOI_IMPLEMENTATION */
--- a/src/stb_image.h
+++ b/src/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
                                  no warranty implied; use at your own risk

   Do this:
@@ -48,6 +48,7 @@ LICENSE

 RECENT REVISION HISTORY:

+      2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
      2.26  (2020-07-13) many minor fixes
      2.25  (2020-02-02) fix warnings
@@ -108,7 +109,7 @@ RECENT REVISION HISTORY:
    Cass Everitt            Ryamond Barbiero                        github:grim210
    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
    Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
-    Josh Tobin                                 Matthew Gregan       github:poppolopoppo
+    Josh Tobin              Neil Bickford      Matthew Gregan       github:poppolopoppo
    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
                            Brad Weinberger    Matvey Cherevko      github:mosra
@@ -140,7 +141,7 @@ RECENT REVISION HISTORY:
 //    // ... x = width, y = height, n = # 8-bit components per pixel ...
 //    // ... replace '0' with '1'..'4' to force that many components per pixel
 //    // ... but 'n' will always be the number that it would have been if you said 0
-//    stbi_image_free(data)
+//    stbi_image_free(data);
 //
 // Standard parameters:
 //    int *x                 -- outputs image width in pixels
@@ -635,7 +636,7 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
   #endif
 #endif

-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__SYMBIAN32__)
 typedef unsigned short stbi__uint16;
 typedef   signed short stbi__int16;
 typedef unsigned int   stbi__uint32;
@@ -1063,6 +1064,23 @@ static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
 }
 #endif

+// returns 1 if a + b fits in a signed int (INT_MIN..INT_MAX inclusive), 0 if the sum would overflow.
+static int stbi__addints_valid(int a, int b)
+{
+   if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so the sum cannot overflow
+   if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
+   return a <= INT_MAX - b; // both non-negative: same as a + b <= INT_MAX; INT_MAX - b cannot overflow since b >= 0
+}
+
+// returns 1 if the product of two signed shorts is valid (intended: fits in SHRT_MIN..SHRT_MAX), 0 on overflow.
+static int stbi__mul2shorts_valid(short a, short b)
+{
+   if (b == 0 || b == -1) return 1; // x*0 is always 0; b == -1 is accepted outright. NOTE(review): a == SHRT_MIN, b == -1 gives 32768 (> SHRT_MAX) yet returns 1 -- confirm
+   if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // same signs, product is non-negative. NOTE(review): for a < 0 && b < 0 the bound looks like it should be a >= SHRT_MAX/b (as written, -100 * -2 is rejected) -- confirm
+   if (b < 0) return a <= SHRT_MIN / b; // a >= 0 > b: same as a * b >= SHRT_MIN
+   return a >= SHRT_MIN / b; // a < 0 < b: same as a * b >= SHRT_MIN
+}
+
 // stbi__err - error
 // stbi__errpf - error returning pointer to float
 // stbi__errpuc - error returning pointer to unsigned char
@@ -1985,9 +2003,12 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
   int i,j,k=0;
   unsigned int code;
   // build size list for each symbol (from JPEG spec)
-   for (i=0; i < 16; ++i)
-      for (j=0; j < count[i]; ++j)
+   for (i=0; i < 16; ++i) {
+      for (j=0; j < count[i]; ++j) {
         h->size[k++] = (stbi_uc) (i+1);
+         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
+      }
+   }
   h->size[k] = 0;

   // compute actual symbols (from jpeg spec)
@@ -2112,6 +2133,8 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)

   // convert the huffman code to the symbol id
   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   if(c < 0 || c >= 256) // symbol id out of bounds!
+       return -1;
   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);

   // convert the id to a symbol
@@ -2130,6 +2153,7 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
   unsigned int k;
   int sgn;
   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing

   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
   k = stbi_lrot(j->code_buffer, n);
@@ -2144,6 +2168,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
 {
   unsigned int k;
   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
   k = stbi_lrot(j->code_buffer, n);
   j->code_buffer = k & ~stbi__bmask[n];
   k &= stbi__bmask[n];
@@ -2155,6 +2180,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
 {
   unsigned int k;
   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
   k = j->code_buffer;
   j->code_buffer <<= 1;
   --j->code_bits;
@@ -2192,8 +2218,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
   memset(data,0,64*sizeof(data[0]));

   diff = t ? stbi__extend_receive(j, t) : 0;
+   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
   dc = j->img_comp[b].dc_pred + diff;
   j->img_comp[b].dc_pred = dc;
+   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
   data[0] = (short) (dc * dequant[0]);

   // decode AC components, see JPEG spec
@@ -2207,6 +2235,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
      if (r) { // fast-AC path
         k += (r >> 4) & 15; // run
         s = r & 15; // combined length
+         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
         j->code_buffer <<= s;
         j->code_bits -= s;
         // decode into unzigzag'd location
@@ -2246,8 +2275,10 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
      diff = t ? stbi__extend_receive(j, t) : 0;

+      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
      dc = j->img_comp[b].dc_pred + diff;
      j->img_comp[b].dc_pred = dc;
+      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
      data[0] = (short) (dc * (1 << j->succ_low));
   } else {
      // refinement scan for DC coefficient
@@ -2282,6 +2313,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
         if (r) { // fast-AC path
            k += (r >> 4) & 15; // run
            s = r & 15; // combined length
+            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
            j->code_buffer <<= s;
            j->code_bits -= s;
            zig = stbi__jpeg_dezigzag[k++];
@@ -3102,6 +3134,7 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
               sizes[i] = stbi__get8(z->s);
               n += sizes[i];
            }
+            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
            L -= 17;
            if (tc == 0) {
               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
@@ -3351,6 +3384,28 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
   return 1;
 }

+static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
+{
+   // Some JPEGs carry junk after the image data. Consume bytes until
+   // something that looks like a real marker shows up, then return its
+   // code. Returns STBI__MARKER_none if the stream ends first.
+   for (;;) {
+      if (stbi__at_eof(j->s)) break;
+      if (stbi__get8(j->s) != 0xff) continue; // ordinary junk byte, keep scanning
+      // saw 0xff: the byte(s) that follow decide whether this is a marker
+      for (;;) {
+         int code;
+         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
+         code = stbi__get8(j->s);
+         if (code == 0x00) break;        // stuffed zero: back to the plain byte scan
+         if (code != 0xff) return code;  // neither stuffed zero nor 0xff: looks like a marker
+         // another 0xff: keep reading, the next byte may still be
+         // the marker code
+      }
+   }
+   return STBI__MARKER_none;
+}
+
 // decode image to YCbCr format
 static int stbi__decode_jpeg_image(stbi__jpeg *j)
 {
@@ -3367,25 +3422,22 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
         if (!stbi__process_scan_header(j)) return 0;
         if (!stbi__parse_entropy_coded_data(j)) return 0;
         if (j->marker == STBI__MARKER_none ) {
-            // handle 0s at the end of image data from IP Kamera 9060
-            while (!stbi__at_eof(j->s)) {
-               int x = stbi__get8(j->s);
-               if (x == 255) {
-                  j->marker = stbi__get8(j->s);
-                  break;
-               }
-            }
+         j->marker = stbi__skip_jpeg_junk_at_end(j);
            // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
         }
+         m = stbi__get_marker(j);
+         if (STBI__RESTART(m))
+            m = stbi__get_marker(j);
      } else if (stbi__DNL(m)) {
         int Ld = stbi__get16be(j->s);
         stbi__uint32 NL = stbi__get16be(j->s);
         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
+         m = stbi__get_marker(j);
      } else {
-         if (!stbi__process_marker(j, m)) return 0;
+         if (!stbi__process_marker(j, m)) return 1;
+         m = stbi__get_marker(j);
      }
-      m = stbi__get_marker(j);
   }
   if (j->progressive)
      stbi__jpeg_finish(j);
@@ -3976,6 +4028,7 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re
   unsigned char* result;
   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
   if (!j) return stbi__errpuc("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
   STBI_NOTUSED(ri);
   j->s = s;
   stbi__setup_jpeg(j);
@@ -3989,6 +4042,7 @@ static int stbi__jpeg_test(stbi__context *s)
   int r;
   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
   if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
   j->s = s;
   stbi__setup_jpeg(j);
   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@@ -4014,6 +4068,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
   int result;
   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
   if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
   j->s = s;
   result = stbi__jpeg_info_raw(j, x, y, comp);
   STBI_FREE(j);
@@ -4256,11 +4311,12 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
            a->zout = zout;
            return 1;
         }
+         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
         z -= 257;
         len = stbi__zlength_base[z];
         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
         z = stbi__zhuffman_decode(a, &a->z_distance);
-         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG");
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
         dist = stbi__zdist_base[z];
         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
@@ -4955,7 +5011,7 @@ STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
 static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
 static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;

-STBIDEF void stbi__unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
 {
   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
   stbi__unpremultiply_on_load_set = 1;
@@ -5064,14 +5120,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
            if (!pal_img_n) {
               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
-               if (scan == STBI__SCAN_header) return 1;
            } else {
               // if paletted, then pal_n is our final components, and
               // img_n is # components to decompress/filter.
               s->img_n = 1;
               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
-               // if SCAN_header, have to scan to see if we have a tRNS
            }
+            // even with SCAN_header, have to scan to see if we have a tRNS
            break;
         }

@@ -5103,6 +5158,8 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
               has_trans = 1;
+               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
+               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
               if (z->depth == 16) {
                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
               } else {
@@ -5115,7 +5172,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
         case STBI__PNG_TYPE('I','D','A','T'): {
            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
-            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
+            if (scan == STBI__SCAN_header) {
+               // header scan definitely stops at first IDAT
+               if (pal_img_n)
+                  s->img_n = pal_img_n;
+               return 1;
+            }
+            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
            if ((int)(ioff + c.length) < (int)ioff) return 0;
            if (ioff + c.length > idata_limit) {
               stbi__uint32 idata_limit_old = idata_limit;
@@ -5498,8 +5561,22 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
         psize = (info.offset - info.extra_read - info.hsz) >> 2;
   }
   if (psize == 0) {
-      if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) {
-        return stbi__errpuc("bad offset", "Corrupt BMP");
+      // accept some number of extra bytes after the header, but if the offset points either to before
+      // the header ends or implies a large amount of extra data, reject the file as malformed
+      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
+      int header_limit = 1024; // max we actually read is below 256 bytes currently.
+      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
+      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
+         return stbi__errpuc("bad header", "Corrupt BMP");
+      }
+      // we established that bytes_read_so_far is positive and sensible.
+      // the first half of this test rejects offsets that are either too small positives, or
+      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
+      // ensures the number computed in the second half of the test can't overflow.
+      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
+         return stbi__errpuc("bad offset", "Corrupt BMP");
+      } else {
+         stbi__skip(s, info.offset - bytes_read_so_far);
      }
   }

@@ -7187,12 +7264,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
                  // Run
                  value = stbi__get8(s);
                  count -= 128;
-                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                  for (z = 0; z < count; ++z)
                     scanline[i++ * 4 + k] = value;
               } else {
                  // Dump
-                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                  for (z = 0; z < count; ++z)
                     scanline[i++ * 4 + k] = stbi__get8(s);
               }
@@ -7446,10 +7523,17 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req

   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8));
+   if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
+      STBI_FREE(out);
+      return stbi__errpuc("bad PNM", "PNM file truncated");
+   }

   if (req_comp && req_comp != s->img_n) {
-      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      if (ri->bits_per_channel == 16) {
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
+      } else {
+         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      }
      if (out == NULL) return out; // stbi__convert_format frees input on failure
   }
   return out;
@@ -7486,6 +7570,8 @@ static int      stbi__pnm_getinteger(stbi__context *s, char *c)
   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
      value = value*10 + (*c - '0');
      *c = (char) stbi__get8(s);
+      if((value > 214748364) || (value == 214748364 && *c > '7'))
+          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
   }

   return value;
@@ -7516,9 +7602,13 @@ static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
   stbi__pnm_skip_whitespace(s, &c);

   *x = stbi__pnm_getinteger(s, &c); // read width
+   if(*x == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
   stbi__pnm_skip_whitespace(s, &c);

   *y = stbi__pnm_getinteger(s, &c); // read height
+   if (*y == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
   stbi__pnm_skip_whitespace(s, &c);

   maxv = stbi__pnm_getinteger(s, &c);  // read max value
@@ -7894,4 +7984,4 @@ AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ------------------------------------------------------------------------------
-*/
+*/
--- a/src/wuffs-v0.3.c
+++ b/src/wuffs-v0.3.c
Author	SHA1	Message	Date
josh	f90c16e34b	Implement CPM support	2024-01-10 16:23:09 -05:00
josh	b6046e67dd	Implement CPM support	2024-01-10 16:13:30 -05:00
josh	df2a649cd3	Implement CPM support	2024-01-10 16:01:47 -05:00
josh	3475cb379c	Implement CPM support	2024-01-10 15:50:34 -05:00
josh	9f2ddd5f76	Implement CPM support	2024-01-10 15:43:15 -05:00
josh	1148e5c916	Update CMakeLists.txt	2024-01-10 15:25:22 -05:00
Rich Geldreich	7298d34590	Update README.md	2023-12-05 01:27:30 -05:00
Rich Geldreich	163d5714a0	Update README.md	2023-04-20 16:48:44 -04:00
Rich Geldreich	1039f5aa9c	Update README.md	2023-04-20 16:47:54 -04:00
Rich Geldreich	b49d1d4d5a	Update README.md	2023-04-20 16:46:09 -04:00
Rich Geldreich	da6991f598	Update README.md	2023-04-20 16:45:04 -04:00
Rich Geldreich	970d9290eb	Update README.md	2023-04-20 16:44:26 -04:00
Richard Geldreich	6ed6544cc9	Changing the # of times to encode to 3, instead of 1, when CSV files aren't being generated.	2023-04-20 16:40:25 -04:00
Richard Geldreich	a77506cd2f	Upgrading lodepng, QOI, and stb_image.h/stb_image_write.h, and adding the pvpng reader from basisu for benchmarking/comparison purposes	2023-04-20 16:21:37 -04:00
Richard Geldreich	a2b4e1bdf6	new files	2023-04-20 16:20:48 -04:00
Rich Geldreich	357d3a6b73	Update README.md	2023-03-30 20:01:28 -04:00
Rich Geldreich	6926f5a0a7	Update README.md	2022-01-11 16:01:37 -05:00
Rich Geldreich	75c8f930ef	Update README.md	2022-01-10 14:56:56 -05:00
Rich Geldreich	a05746ac46	Update README.md	2022-01-10 14:55:02 -05:00
Richard Geldreich	e3834907b9	Merge remote-tracking branch 'origin/main'	2022-01-10 14:54:17 -05:00
Richard Geldreich	645d49cf6b	Disabling Wuff's CRC checking Minor printf() fixes	2022-01-10 14:53:52 -05:00
Rich Geldreich	85a734e9f2	Update README.md	2022-01-10 14:48:40 -05:00
Rich Geldreich	6fea592540	Update README.md	2022-01-05 19:39:26 -05:00
Rich Geldreich	32d7f3a388	Update README.md	2022-01-05 17:53:36 -05:00
Rich Geldreich	01296df391	Bumping version to 1.0.6	2022-01-05 17:14:20 -05:00
Rich Geldreich	a80ccdd937	New file	2022-01-05 17:14:20 -05:00
Rich Geldreich	c8ea38f3ee	Update README.md	2022-01-05 17:11:43 -05:00
Rich Geldreich	3bd9c4dbda	Adding workaround to the encoder to work around a bug in wuff's distance table decoder Adding -t Huffman table training option to test harness - set FPNG_TRAIN_HUFFMAN_TABLES to 1 to use it Re-trained the single pass mode's Huffman tables	2022-01-05 17:08:49 -05:00
Rich Geldreich	42303b97e2	Update README.md	2022-01-05 08:34:25 -05:00
Rich Geldreich	f813c4dfdb	fixing cpuid code so it's only compiled on x86	2022-01-04 16:59:30 -05:00
Rich Geldreich	49f3505062	Update README.md	2022-01-02 13:18:01 -05:00
Rich Geldreich	1c9d03942c	Fixing comment	2021-12-31 19:22:33 -05:00
Rich Geldreich	66b531956b	Update README.md	2021-12-31 18:54:48 -05:00
Rich Geldreich	c471ebef9d	Changing SSE adler32 to iterate 16 bytes at a time vs. 8. Changing adler32 function types to match the crc32 function.	2021-12-31 18:52:55 -05:00
Rich Geldreich	c83e17b38a	Adding SSE to the filter code, for another 10-15% compression perf gain	2021-12-31 18:22:36 -05:00
Rich Geldreich	3a3f22f968	fixing comment	2021-12-31 16:46:09 -05:00
Rich Geldreich	583b75c986	Fixing check here so it uses a uint64_t multiply	2021-12-31 16:41:33 -05:00
Rich Geldreich	b864f3324f	Fixing typo in remark	2021-12-31 14:12:32 -05:00