|
|
|
@ -166,14 +166,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
// Fast power-of-two texture wrap (SW_REPEAT mode only)
|
|
|
|
// Fast power-of-two texture wrap (SW_REPEAT mode only)
|
|
|
|
// When defined, textures whose width/height are powers of two use a bitmask
|
|
|
|
// When defined, textures whose width/height are powers of two use a bitmask
|
|
|
|
// wrap (`x & (size-1)`) instead of `floorf`-based fractional wrap or the
|
|
|
|
// wrap (`x & (size-1)`) instead of `floorf`-based fractional wrap or the signed `%` chain in the linear sampler
|
|
|
|
// signed `%` chain in the linear sampler. Saves a software divide on Xtensa
|
|
|
|
// Saves a software divide on Xtensa and a few instructions everywhere
|
|
|
|
// and a few instructions everywhere. NPOT textures keep using the original
|
|
|
|
// NPOT textures keep using the original path via a runtime `(size & (size-1)) == 0` check,
|
|
|
|
// path via a runtime `(size & (size-1)) == 0` check, so SW_REPEAT remains
|
|
|
|
// so SW_REPEAT remains correct for them
|
|
|
|
// correct for them. The only observable behavior change is for POT textures
|
|
|
|
// The only observable behavior change is for POT textures sampled with negative UV coordinates:
|
|
|
|
// sampled with negative UV coordinates: bitmask wrap (two's complement) can
|
|
|
|
// bitmask wrap (two's complement) can differ from `sw_fract` by one texel
|
|
|
|
// differ from `sw_fract` by one texel. Off by default to keep bit-for-bit
|
|
|
|
// Off by default to keep bit-for-bit behavior; opt in if you control your asset UVs
|
|
|
|
// behavior; opt in if you control your asset UVs.
|
|
|
|
|
|
|
|
//
|
|
|
|
//
|
|
|
|
//#define SW_TEXTURE_REPEAT_POT_FAST
|
|
|
|
//#define SW_TEXTURE_REPEAT_POT_FAST
|
|
|
|
|
|
|
|
|
|
|
|
@ -860,11 +859,9 @@ SWAPI void swGetFramebufferAttachmentParameteriv(SWattachment attachment, SWatta
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// ESP-DSP acceleration: ESP-IDF ships an optimized math library that includes
|
|
|
|
// ESP-DSP acceleration: ESP-IDF ships an optimized math library that includes
|
|
|
|
// `dspm_mult_4x4x4_f32` (4x4 matrix multiply) and `dspm_mult_4x4x1_f32`
|
|
|
|
// `dspm_mult_4x4x4_f32` (4x4 matrix multiply) and `dspm_mult_4x4x1_f32` (matrix * vector)
|
|
|
|
// (matrix * vector). These are S3-tuned hand-vectorized kernels that beat the
|
|
|
|
// These are S3-tuned hand-vectorized kernels that beat the scalar versions for both throughput and code-size
|
|
|
|
// scalar versions for both throughput and code-size. Detection is opt-in to
|
|
|
|
// Detection is opt-in to keep the dependency optional: define SW_USE_ESP_DSP from your build system
|
|
|
|
// keep the dependency optional: define SW_USE_ESP_DSP from your build system
|
|
|
|
|
|
|
|
// (or rely on the `idf_component.yml` example shown in the rlsw docs).
|
|
|
|
|
|
|
|
#if defined(ESP_PLATFORM) && defined(SW_USE_ESP_DSP)
|
|
|
|
#if defined(ESP_PLATFORM) && defined(SW_USE_ESP_DSP)
|
|
|
|
#define SW_HAS_ESP_DSP
|
|
|
|
#define SW_HAS_ESP_DSP
|
|
|
|
#include "dspm_mult.h"
|
|
|
|
#include "dspm_mult.h"
|
|
|
|
@ -884,10 +881,10 @@ SWAPI void swGetFramebufferAttachmentParameteriv(SWattachment attachment, SWatta
|
|
|
|
#define SW_DEG2RAD (SW_PI/180.0f)
|
|
|
|
#define SW_DEG2RAD (SW_PI/180.0f)
|
|
|
|
#define SW_RAD2DEG (180.0f/SW_PI)
|
|
|
|
#define SW_RAD2DEG (180.0f/SW_PI)
|
|
|
|
|
|
|
|
|
|
|
|
// When clipping a convex polygon against a plane, at most one vertex is added.
|
|
|
|
// When clipping a convex polygon against a plane, at most one vertex is added
|
|
|
|
// Starting from a quadrilateral (4 vertices), clipped sequentially against
|
|
|
|
// Starting from a quadrilateral (4 vertices), clipped sequentially against
|
|
|
|
// the frustum (6 planes) then the scissor rectangle (4 planes):
|
|
|
|
// the frustum (6 planes) then the scissor rectangle (4 planes):
|
|
|
|
// 4 + 6 + 4 = 14 vertices maximum.
|
|
|
|
// 4 + 6 + 4 = 14 vertices maximum
|
|
|
|
#define SW_MAX_CLIPPED_POLYGON_VERTICES 14
|
|
|
|
#define SW_MAX_CLIPPED_POLYGON_VERTICES 14
|
|
|
|
#define SW_CLIP_EPSILON 1e-4f
|
|
|
|
#define SW_CLIP_EPSILON 1e-4f
|
|
|
|
|
|
|
|
|
|
|
|
@ -1175,7 +1172,7 @@ static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RES
|
|
|
|
// column-major, so passing them flat is equivalent to passing transposes:
|
|
|
|
// column-major, so passing them flat is equivalent to passing transposes:
|
|
|
|
// dspm_mult(L^T, R^T) computes (L^T)*(R^T) = (R*L)^T, written back into a
|
|
|
|
// dspm_mult(L^T, R^T) computes (L^T)*(R^T) = (R*L)^T, written back into a
|
|
|
|
// flat array gives the same bit pattern as the column-major product (R*L)
|
|
|
|
// flat array gives the same bit pattern as the column-major product (R*L)
|
|
|
|
// -- exactly the semantic the scalar fallback below has.
|
|
|
|
// -- exactly the semantic the scalar fallback below has
|
|
|
|
dspm_mult_4x4x4_f32(left, right, dst);
|
|
|
|
dspm_mult_4x4x4_f32(left, right, dst);
|
|
|
|
#else
|
|
|
|
#else
|
|
|
|
float l00 = left[0], l01 = left[1], l02 = left[2], l03 = left[3];
|
|
|
|
float l00 = left[0], l01 = left[1], l02 = left[2], l03 = left[3];
|
|
|
|
@ -1248,12 +1245,12 @@ static inline float sw_fract(float x)
|
|
|
|
return (x - floorf(x));
|
|
|
|
return (x - floorf(x));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Fast reciprocal: 1-ULP accurate in ~7 instructions on Xtensa using the
|
|
|
|
// Xtensa architecture optimization
|
|
|
|
// hardware `recip0.s` seed + two Newton-Raphson refinement steps. All work
|
|
|
|
// Fast reciprocal: 1-ULP accurate in ~7 instructions using the
|
|
|
|
// stays in FPU registers — no `__divsf3` software call. Hot-path divisions
|
|
|
|
// hardware `recip0.s` seed + two Newton-Raphson refinement steps
|
|
|
|
// in the rasterizer (span/triangle setup, perspective divide, etc.) call
|
|
|
|
// All work stays in FPU registers — no `__divsf3` software call
|
|
|
|
// this. On non-Xtensa targets it transparently expands to `1.0f / x`, so
|
|
|
|
// Hot-path divisions in the rasterizer (span/triangle setup, perspective divide, etc.) call this
|
|
|
|
// generated code is identical to before.
|
|
|
|
// On non-Xtensa targets it transparently expands to `1.0f / x`, so generated code is identical to before
|
|
|
|
#if defined(__XTENSA__)
|
|
|
|
#if defined(__XTENSA__)
|
|
|
|
__attribute__((always_inline))
|
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline float sw_rcp(float x)
|
|
|
|
static inline float sw_rcp(float x)
|
|
|
|
@ -3558,8 +3555,8 @@ static inline bool sw_quad_face_culling(void)
|
|
|
|
// winding in the projected space when all w > 0
|
|
|
|
// winding in the projected space when all w > 0
|
|
|
|
// A value of 0 for sgnArea means P0, P1, P2 are collinear in (x, y, w)
|
|
|
|
// A value of 0 for sgnArea means P0, P1, P2 are collinear in (x, y, w)
|
|
|
|
// space, which corresponds to a degenerate triangle projection
|
|
|
|
// space, which corresponds to a degenerate triangle projection
|
|
|
|
// Such quads might also be degenerate or non-planar. They are typically
|
|
|
|
// Such quads might also be degenerate or non-planar
|
|
|
|
// not culled by this test (0 < 0 is false, 0 > 0 is false)
|
|
|
|
// They are typically not culled by this test (0 < 0 is false, 0 > 0 is false)
|
|
|
|
// and should be handled by the clipper if necessary
|
|
|
|
// and should be handled by the clipper if necessary
|
|
|
|
|
|
|
|
|
|
|
|
return (RLSW.cullFace == SW_FRONT)? (sgnArea < 0.0f) : (sgnArea > 0.0f); // Cull if winding is "clockwise" : "counter-clockwise"
|
|
|
|
return (RLSW.cullFace == SW_FRONT)? (sgnArea < 0.0f) : (sgnArea > 0.0f); // Cull if winding is "clockwise" : "counter-clockwise"
|
|
|
|
@ -3879,8 +3876,7 @@ static inline void sw_poly_fill_render(uint32_t state)
|
|
|
|
//-------------------------------------------------------------------------------------------
|
|
|
|
//-------------------------------------------------------------------------------------------
|
|
|
|
static void sw_immediate_begin(SWdraw mode)
|
|
|
|
static void sw_immediate_begin(SWdraw mode)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// NOTE: Any checks to ensure command recording can start
|
|
|
|
// NOTE: Any checks to ensure command recording can start must be performed before calling this function
|
|
|
|
// must be performed before calling this function.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Recalculate the MVP if this is needed
|
|
|
|
// Recalculate the MVP if this is needed
|
|
|
|
if (RLSW.isDirtyMVP)
|
|
|
|
if (RLSW.isDirtyMVP)
|
|
|
|
@ -3891,8 +3887,8 @@ static void sw_immediate_begin(SWdraw mode)
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef SW_HAS_ESP_DSP
|
|
|
|
#ifdef SW_HAS_ESP_DSP
|
|
|
|
// Pre-transpose to row-major so dspm_mult_4x4x1_f32(matMVP_rm, v, out)
|
|
|
|
// Pre-transpose to row-major so dspm_mult_4x4x1_f32(matMVP_rm, v, out)
|
|
|
|
// computes M*v directly in the per-vertex hot path. 16 scalar copies
|
|
|
|
// computes M*v directly in the per-vertex hot path; 16 scalar copies
|
|
|
|
// per MVP update vs. saving ~20 cycles per vertex transform.
|
|
|
|
// per MVP update vs saving ~20 cycles per vertex transform
|
|
|
|
for (int i = 0; i < 4; i++)
|
|
|
|
for (int i = 0; i < 4; i++)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
for (int j = 0; j < 4; j++)
|
|
|
|
for (int j = 0; j < 4; j++)
|
|
|
|
@ -3955,7 +3951,7 @@ static void sw_immediate_push_vertex(const float position[4])
|
|
|
|
// Calculate clip coordinates
|
|
|
|
// Calculate clip coordinates
|
|
|
|
#ifdef SW_HAS_ESP_DSP
|
|
|
|
#ifdef SW_HAS_ESP_DSP
|
|
|
|
// dspm_mult_4x4x1_f32 declares its inputs non-const; rlsw treats them as
|
|
|
|
// dspm_mult_4x4x1_f32 declares its inputs non-const; rlsw treats them as
|
|
|
|
// read-only and the cast is safe (the kernel only loads from B).
|
|
|
|
// read-only and the cast is safe (the kernel only loads from B)
|
|
|
|
dspm_mult_4x4x1_f32(RLSW.matMVP_rm, (float *)position, vertex->position);
|
|
|
|
dspm_mult_4x4x1_f32(RLSW.matMVP_rm, (float *)position, vertex->position);
|
|
|
|
#else
|
|
|
|
#else
|
|
|
|
const float *m = RLSW.matMVP;
|
|
|
|
const float *m = RLSW.matMVP;
|
|
|
|
@ -5567,7 +5563,7 @@ static void SW_RASTER_TRIANGLE(const sw_vertex_t *v0, const sw_vertex_t *v1, con
|
|
|
|
if (v0->position[1] > v1->position[1]) { const sw_vertex_t *tmp = v0; v0 = v1; v1 = tmp; }
|
|
|
|
if (v0->position[1] > v1->position[1]) { const sw_vertex_t *tmp = v0; v0 = v1; v1 = tmp; }
|
|
|
|
|
|
|
|
|
|
|
|
// Extracting coordinates from the sorted vertices
|
|
|
|
// Extracting coordinates from the sorted vertices
|
|
|
|
// Put x away for safe keeping. Only y is used right now. Silences warnings.
|
|
|
|
// Put x away for safe keeping; only y is used right now; silences warnings
|
|
|
|
float y0 = v0->position[1];
|
|
|
|
float y0 = v0->position[1];
|
|
|
|
float y1 = v1->position[1];
|
|
|
|
float y1 = v1->position[1];
|
|
|
|
float y2 = v2->position[1];
|
|
|
|
float y2 = v2->position[1];
|
|
|
|
|