Faster blend functions
diff --git a/32blit/graphics/blend.cpp b/32blit/graphics/blend.cpp
index 8d10ff0..c9e2aec 100644
--- a/32blit/graphics/blend.cpp
+++ b/32blit/graphics/blend.cpp
@@ -26,15 +26,137 @@
     return d + ((a * (s - d) + 127) >> 8);
   }
 
+  /**
+   * Blend the colour of pen `s` onto `c` packed 3-byte RGB pixels starting at `d`,
+   * using the same alpha `a` for every pixel.
+   *
+   * Runs of more than four pixels are blended one 32-bit word at a time once `d`
+   * is 4-byte aligned; the word accesses rely on little-endian byte order.
+   */
+  __attribute__((always_inline)) inline void blend_rgba_rgb(const Pen *s, uint8_t *d, const uint8_t &a, uint32_t c) {
+    if (c == 1) {
+      // fast case for single pixel draw
+      *d = blend(s->r, *d, a); d++;
+      *d = blend(s->g, *d, a); d++;
+      *d = blend(s->b, *d, a); d++;
+      return;
+    }
+
+    if (c <= 4) {
+      // fast case for small number of pixels
+      while (c--) {
+        *d = blend(s->r, *d, a); d++;
+        *d = blend(s->g, *d, a); d++;
+        *d = blend(s->b, *d, a); d++;
+      }
+      return;
+    }
+
+    // create packed 32bit source holding the bytes R, G, B, R (lowest byte first);
+    // built from the channels so it does not depend on Pen's memory layout
+    uint32_t s32 = uint32_t(s->r) | (uint32_t(s->g) << 8) | (uint32_t(s->b) << 16) | (uint32_t(s->r) << 24);
+
+    // if destination is not double-word aligned blend at most three bytes until it is
+    uint8_t* de = d + c * 3;
+    while (uintptr_t(d) & 0b11) {
+      *d = blend((s32 & 0xff), *d, a); d++;
+      // rotate the aligned rgbr/gbrg/brgb quad
+      s32 >>= 8; s32 |= (s32 & 0xff) << 24;
+    }
+
+    // destination is now double-word aligned
+    if (d < de) {
+      // get a double-word aligned pointer to the destination surface
+      uint32_t *d32 = (uint32_t*)d;
+
+      // blend four bytes at a time until we have fewer than four bytes remaining
+      uint32_t c32 = uint32_t(de - d) >> 2;
+      while (c32--) {
+        uint32_t dd32 = *d32;
+
+        *d32++ = blend((s32 & 0xff), (dd32 & 0xff), a) |
+                (blend((s32 & 0xff00) >> 8, (dd32 & 0xff00) >> 8, a) << 8) |
+                (blend((s32 & 0xff0000) >> 16, (dd32 & 0xff0000) >> 16, a) << 16) |
+                (uint32_t(blend((s32 & 0xff000000) >> 24, (dd32 & 0xff000000) >> 24, a)) << 24);
+
+        // rotate the aligned rgbr/gbrg/brgb quad
+        s32 >>= 8; s32 |= (s32 & 0xff) << 24;
+      }
+
+      // blend the trailing bytes, resuming where the word loop stopped (not at de)
+      d = (uint8_t*)d32;
+      while (d < de) {
+        *d = blend((s32 & 0xff), *d, a); s32 >>= 8; d++;
+      }
+    }
+  }
+
+  /**
+   * Write the colour of pen `s` to `c` packed 3-byte RGB pixels starting at `d`;
+   * the pen's alpha is not used.
+   *
+   * Runs of more than four pixels are written one 32-bit word at a time once `d`
+   * is 4-byte aligned; the word stores rely on little-endian byte order.
+   */
+  __attribute__((always_inline)) inline void copy_rgba_rgb(const Pen* s, uint8_t *d, uint32_t c) {
+    if (c == 1) {
+      // fast case for single pixel draw
+      *(d + 0) = s->r; *(d + 1) = s->g; *(d + 2) = s->b;
+      return;
+    }
+
+    if (c <= 4) {
+      // fast case for small number of pixels; while (c--) also copes with c == 0
+      while (c--) {
+        *(d + 0) = s->r; *(d + 1) = s->g; *(d + 2) = s->b; d += 3;
+      }
+      return;
+    }
+
+    // create packed 32bit source holding the bytes R, G, B, R (lowest byte first);
+    // built from the channels so it does not depend on Pen's memory layout
+    uint32_t s32 = uint32_t(s->r) | (uint32_t(s->g) << 8) | (uint32_t(s->b) << 16) | (uint32_t(s->r) << 24);
+
+    // if destination is not double-word aligned copy at most three bytes until it is
+    uint8_t* de = d + c * 3;
+    while (uintptr_t(d) & 0b11) {
+      *d = s32 & 0xff; d++;
+      // rotate the aligned rgbr/gbrg/brgb quad
+      s32 >>= 8; s32 |= (s32 & 0xff) << 24;
+    }
+
+    // destination is now double-word aligned
+    if (d < de) {
+      // get a double-word aligned pointer to the destination surface
+      uint32_t *d32 = (uint32_t*)d;
+
+      // copy four bytes at a time until we have fewer than four bytes remaining
+      uint32_t c32 = uint32_t(de - d) >> 2;
+      while (c32--) {
+        *d32++ = s32;
+        // rotate the aligned rgbr/gbrg/brgb quad
+        s32 >>= 8; s32 |= (s32 & 0xff) << 24;
+      }
+
+      // copy the trailing bytes, resuming where the word loop stopped (not at de)
+      d = (uint8_t*)d32;
+      while (d < de) {
+        *d = (s32 & 0xff); s32 >>= 8; d++;
+      }
+    }
+  }
+
   void RGBA_RGBA(const Pen* pen, const Surface* dest, uint32_t off, uint32_t cnt) {
     uint8_t* d = dest->data + (off * 4);
     uint8_t* m = dest->mask ? dest->mask->data + off : nullptr;
 
+    // alpha(pen->a, dest->alpha) does not change per pixel, so compute it once
+    uint16_t a1 = alpha(pen->a, dest->alpha);
     do {
-      uint16_t a = m ? alpha(pen->a, *m++, dest->alpha) : alpha(pen->a, dest->alpha);
+      uint16_t a = m ? alpha(a1, *m++) : a1;
 
       if (a >= 255) {
-        *d++ = pen->r; *d++ = pen->g; *d++ = pen->b; d++;
+        *d++ = pen->r; *d++ = pen->g; *d++ = pen->b; *d++ = 255;
       } else if (a > 0) {
         *d = blend(pen->r, *d, a); d++;
         *d = blend(pen->g, *d, a); d++;
@@ -46,23 +168,34 @@
     } while (--cnt);
   }
 
-  void RGBA_RGB(const Pen* pen, const Surface* dest, uint32_t off, uint32_t cnt) {
+  // Draw `c` pixels of `pen` onto the 3-byte-per-pixel data of `dest`, starting at pixel offset `off`.
+  // With a mask every pixel gets its own alpha; without one the whole run shares a single alpha.
+  void RGBA_RGB(const Pen* pen, const Surface* dest, uint32_t off, uint32_t c) {
     uint8_t* d = dest->data + (off * 3);
     uint8_t* m = dest->mask ? dest->mask->data + off : nullptr;
 
-    do {
-      uint16_t a = m ? alpha(pen->a, *m++, dest->alpha) : alpha(pen->a, dest->alpha);
-
+    uint16_t a = alpha(pen->a, dest->alpha);
+    if (!m) {
+      // no mask
       if (a >= 255) {
-        *d++ = pen->r; *d++ = pen->g; *d++ = pen->b;
-      } else if (a > 0) {
-        *d = blend(pen->r, *d, a); d++;
-        *d = blend(pen->g, *d, a); d++;
-        *d = blend(pen->b, *d, a); d++;
-      }else{
+        // no alpha, just copy
+        copy_rgba_rgb(pen, d, c);
+      } else if (a > 0) {
+        // alpha, blend (a == 0 would leave every byte unchanged, so skip it)
+        blend_rgba_rgb(pen, d, a, c);
+      }
+    } else {
+      // mask enabled, per-pixel alpha: copy when opaque, skip when transparent
+      do {
+        uint16_t ma = alpha(a, *m++);
+        if (ma >= 255) {
+          copy_rgba_rgb(pen, d, 1);
+        } else if (ma > 0) {
+          blend_rgba_rgb(pen, d, ma, 1);
+        }
         d += 3;
-      }
-    } while (--cnt);
+      } while (--c);
+    }
   }
 
   void P_P(const Pen* pen, const Surface* dest, uint32_t off, uint32_t cnt) {