Implement unaligned word copy.

This commit is contained in:
Reinhard Panhuber 2021-03-25 14:28:59 +01:00
parent 1e4e87de51
commit bfddfbadc7
1 changed file with 62 additions and 46 deletions

View File

@ -107,30 +107,33 @@ static inline uint16_t _ff_mod(uint16_t idx, uint16_t depth)
// TODO generalize with configurable 1 byte or 4 byte each read
// Copy len bytes into dst by repeatedly reading 32 bit words from the single,
// never incremented address src (presumably a memory-mapped RX FIFO register -
// confirm against the callers).
//   dst : destination buffer, may have any alignment
//   src : constant source address, read as a volatile 32 bit word
//   len : number of bytes to store to dst
// Exactly (len + 3) / 4 word reads are issued to src; of the last word only the
// first (len & 3) bytes - in memory order - are stored.
static void _tu_fifo_read_from_const_src_ptr_in_full_words(void * dst, const void * src, uint16_t len)
{
  const volatile uint32_t * rx_fifo = (const volatile uint32_t *) src;

  // Optimize for fast word copies: a packed wrapper lets the compiler emit a
  // word store that is valid for an arbitrarily aligned destination
  typedef struct {
    uint32_t val;
  } __attribute__((__packed__)) unaligned_uint32_t;

  unaligned_uint32_t * dst_una = (unaligned_uint32_t *) dst;

  // Reading full available 32 bit words from FIFO
  uint16_t full_words = len >> 2;
  while (full_words--)
  {
    dst_una->val = *rx_fifo;
    dst_una++;
  }

  // Read the remaining 1-3 bytes from FIFO
  uint8_t bytes_rem = len & 0x03;
  if (bytes_rem != 0)
  {
    // One more word read; only its leading bytes_rem bytes are kept
    uint32_t tmp = *rx_fifo;
    const uint8_t * tmp_u8 = (const uint8_t *) &tmp;
    uint8_t * dst_u8 = (uint8_t *) dst_una;

    while (bytes_rem--)
    {
      *dst_u8++ = *tmp_u8++;
    }
  }
}
@ -141,29 +144,34 @@ static void _tu_fifo_read_from_const_src_ptr_in_full_words(void * dst, const voi
// Write len bytes from src to the single, never incremented address dst as
// 32 bit words (presumably a memory-mapped TX FIFO register - confirm against
// the callers).
//   dst : constant destination address, written as a volatile 32 bit word
//   src : source buffer, may have any alignment; it is only read
//   len : number of bytes taken from src
// Exactly (len + 3) / 4 word writes are issued to dst; a trailing partial word
// is padded with zero bytes before it is written.
static void _tu_fifo_write_to_const_dst_ptr_in_full_words(void * dst, const void * src, uint16_t len)
{
  volatile uint32_t * tx_fifo = (volatile uint32_t *) dst;

  // Optimize for fast word copies: a packed wrapper lets the compiler emit a
  // word load that is valid for an arbitrarily aligned source
  typedef struct {
    uint32_t val;
  } __attribute__((__packed__)) unaligned_uint32_t;

  const unaligned_uint32_t * src_una = (const unaligned_uint32_t *) src;

  // Pushing full available 32 bit words to FIFO
  uint16_t full_words = len >> 2;
  while (full_words--)
  {
    *tx_fifo = src_una->val;
    src_una++;
  }

  // Write the remaining 1-3 bytes into FIFO
  uint8_t bytes_rem = len & 0x03;
  if (bytes_rem)
  {
    const uint8_t * src_u8 = (const uint8_t *) src_una;
    uint32_t tmp = 0; // unused trailing bytes of the last word stay zero
    uint8_t * tmp_u8 = (uint8_t *) &tmp;

    while (bytes_rem--)
    {
      *tmp_u8++ = *src_u8++;
    }

    *tx_fifo = tmp;
  }
}
@ -209,30 +217,31 @@ static void _ff_push_n(tu_fifo_t* f, void const * data, uint16_t n, uint16_t wRe
uint16_t nLin = (f->depth - wRel) * f->item_size; uint16_t nLin = (f->depth - wRel) * f->item_size;
uint16_t nWrap = (n - nLin) * f->item_size; uint16_t nWrap = (n - nLin) * f->item_size;
uint8_t * dst_u8 = (uint8_t *)(f->buffer + (wRel * f->item_size)); // Optimize for fast word copies
typedef struct{
uint32_t val;
} __attribute((__packed__)) unaligned_uint32_t;
unaligned_uint32_t* dst = (unaligned_uint32_t*)(f->buffer + (wRel * f->item_size));
volatile uint32_t * rx_fifo = (volatile uint32_t *) data; volatile uint32_t * rx_fifo = (volatile uint32_t *) data;
CFG_TUSB_MEM_ALIGN uint32_t tmp;
// Write full words of linear part to buffer // Write full words of linear part to buffer
uint16_t full_words = nLin >> 2; uint16_t full_words = nLin >> 2;
uint8_t rem = nLin - (full_words << 2);
while(full_words--) while(full_words--)
{ {
tmp = *rx_fifo; dst->val = *rx_fifo;
memcpy(dst_u8, &tmp, 4); dst++;
// dst_u8[0] = tmp & 0x000000FF;
// dst_u8[1] = (tmp & 0x0000FF00) >> 8;
// dst_u8[2] = (tmp & 0x00FF0000) >> 16;
// dst_u8[3] = (tmp & 0xFF000000) >> 24;
dst_u8 += 4;
} }
uint8_t * dst_u8;
uint8_t rem = nLin & 0x03;
// Handle wrap around // Handle wrap around
if (rem > 0) if (rem > 0)
{ {
dst_u8 = (uint8_t *)dst;
uint8_t remrem = tu_min16(nWrap, 4-rem); uint8_t remrem = tu_min16(nWrap, 4-rem);
nWrap -= remrem; nWrap -= remrem;
tmp = *rx_fifo; uint32_t tmp = *rx_fifo;
uint8_t * src_u8 = ((uint8_t *) &tmp); uint8_t * src_u8 = ((uint8_t *) &tmp);
while(rem--) while(rem--)
{ {
@ -299,25 +308,32 @@ static void _ff_pull_n(tu_fifo_t* f, void * p_buffer, uint16_t n, uint16_t rRel,
uint16_t nLin = (f->depth - rRel) * f->item_size; uint16_t nLin = (f->depth - rRel) * f->item_size;
uint16_t nWrap = (n - nLin) * f->item_size; uint16_t nWrap = (n - nLin) * f->item_size;
// Optimize for fast word copies
typedef struct{
uint32_t val;
} __attribute((__packed__)) unaligned_uint32_t;
unaligned_uint32_t* src = (unaligned_uint32_t*)(f->buffer + (rRel * f->item_size));
volatile uint32_t * tx_fifo = (volatile uint32_t *) p_buffer; volatile uint32_t * tx_fifo = (volatile uint32_t *) p_buffer;
uint8_t * src_u8 = f->buffer + (rRel * f->item_size);
CFG_TUSB_MEM_ALIGN uint32_t tmp;
// Pushing full available 32 bit words to FIFO // Pushing full available 32 bit words to FIFO
uint16_t full_words = nLin >> 2; uint16_t full_words = nLin >> 2;
uint8_t rem = nLin - (full_words << 2);
while(full_words--) while(full_words--)
{ {
memcpy(&tmp, src_u8, 4); *tx_fifo = src->val;
*tx_fifo = tmp; src++;
src_u8 += 4;
} }
uint8_t * src_u8;
uint8_t rem = nLin & 0x03;
// Handle wrap around - do it manually as these are only 4 bytes and its faster without memcpy // Handle wrap around - do it manually as these are only 4 bytes and its faster without memcpy
if (rem > 0) if (rem > 0)
{ {
src_u8 = (uint8_t *) src;
uint8_t remrem = tu_min16(nWrap, 4-rem); uint8_t remrem = tu_min16(nWrap, 4-rem);
nWrap -= remrem; nWrap -= remrem;
uint32_t tmp;
uint8_t * dst_u8 = (uint8_t *)&tmp; uint8_t * dst_u8 = (uint8_t *)&tmp;
while(rem--) while(rem--)
{ {