Here's some C and PRU assembly code I wrote to see how fast the PRU can write to system (DDR) memory.
And here's the assembly:
Here's the output I get, about 200MB/sec:
If I increase the number of bytes written per SBBO from 4 to 8 (changing both the SBBO length and the ADD increment after LOOP0), I believe each store writes the contents of both r10 and r11 into memory, and throughput rises to about 320MB/sec. With 16 bytes per write, I get about 450MB/sec.
So the PRU really can write very quickly to system RAM.
// Loads a .bin file into a BeagleBone PRU and then interacts with it
// in shared PRU memory and (system-wide) DDR memory.
//
// Pass in the filename of the .bin file on the command line, eg:
// $ ./pru_loader foo.bin
//
// Compile with:
// gcc -std=gnu99 -o pru_loader pru_loader.c -lprussdrv
#include <unistd.h>
#include <stdio.h>
#include <inttypes.h>
#include <prussdrv.h>
#include <pruss_intc_mapping.h>
// Loads the PRU program named on the command line, hands it the location and
// size of the shared DDR segment through PRU shared RAM, then samples the
// PRU's progress once per second before waiting for its completion event.
//
// Returns 0 on success, 1 on a usage error or if any prussdrv setup step
// fails.
int main(int argc, char **argv) {
    // Number of one-second progress samples; sample i also prints DDR word i.
    enum { SAMPLE_COUNT = 10 };

    if (argc != 2) {
        printf("Usage: %s pru_code.bin\n", argv[0]);
        return 1;
    }

    // If this segfaults, make sure you're executing as root.
    prussdrv_init();
    if (prussdrv_open(PRU_EVTOUT_0) == -1) {
        printf("prussdrv_open() failed\n");
        return 1;
    }

    tpruss_intc_initdata pruss_intc_initdata = PRUSS_INTC_INITDATA;
    if (prussdrv_pruintc_init(&pruss_intc_initdata) != 0) {
        printf("prussdrv_pruintc_init() failed\n");
        prussdrv_exit();
        return 1;
    }

    // Map the PRU shared data RAM (12KB on the AM335x). The mapping call
    // wants a plain void **, so map into a raw pointer first and then view
    // it as volatile 32-bit words: the PRU changes this memory behind the
    // compiler's back.
    void *shared_ram_raw = NULL;
    if (prussdrv_map_prumem(PRUSS0_SHARED_DATARAM, &shared_ram_raw) != 0 ||
        shared_ram_raw == NULL) {
        printf("prussdrv_map_prumem() failed\n");
        prussdrv_exit();
        return 1;
    }
    volatile uint32_t *shared_memory = shared_ram_raw;

    // Map the DDR RAM segment reserved by the uio_pruss kernel module.
    void *ddr_raw = NULL;
    if (prussdrv_map_extmem(&ddr_raw) != 0 || ddr_raw == NULL) {
        printf("prussdrv_map_extmem() failed\n");
        prussdrv_exit();
        return 1;
    }
    volatile uint32_t *shared_ddr = ddr_raw;

    unsigned int shared_ddr_len = prussdrv_extmem_size();
    unsigned int physical_address = prussdrv_get_phys_addr(ddr_raw);
    printf("%u bytes of shared DDR available.\n Physical (PRU-side) address:%x\n",
           shared_ddr_len, physical_address);
    printf("Virtual (linux-side) address: %p\n\n", ddr_raw);

    // The sampling loop below reads the first SAMPLE_COUNT words of the
    // segment; refuse to run if the segment is smaller than that.
    if (shared_ddr_len < SAMPLE_COUNT * sizeof(uint32_t)) {
        printf("Shared DDR segment is too small\n");
        prussdrv_exit();
        return 1;
    }

    // We'll use the first 8 bytes of PRU shared RAM to tell the PRU where the
    // shared segment of system memory is and how large it is.
    shared_memory[0] = physical_address;
    shared_memory[1] = shared_ddr_len;

    // Change to 0 to use PRU0
    int which_pru = 1;
    if (prussdrv_exec_program(which_pru, argv[1]) != 0) {
        // Without a running program the completion event never arrives, so
        // bail out instead of blocking forever in prussdrv_pru_wait_event().
        printf("prussdrv_exec_program() failed for %s\n", argv[1]);
        prussdrv_exit();
        return 1;
    }

    for (int i = 0; i < SAMPLE_COUNT; i++) {
        sleep(1);

        // See if the PRU is successfully writing the physical address of each
        // word at the (virtual, from our viewpoint) address.
        printf("DDR[%d] is: %p / 0x%x\n", i, (void *) (shared_ddr + i),
               (unsigned int) shared_ddr[i]);

        // The PRU program overwrites shared RAM word 0 with its count of
        // completed passes over the DDR segment. The running total exceeds
        // INT_MAX before the PRU finishes, so accumulate it in 64 bits.
        uint32_t passes = shared_memory[0];
        uint64_t bytes_written = (uint64_t) passes * shared_ddr_len;
        printf("Bytes written: %" PRIu64 "\n", bytes_written);
    }

    // Wait for the PRU to let us know it's done
    prussdrv_pru_wait_event(PRU_EVTOUT_0);
    printf("All done\n");

    prussdrv_pru_disable(which_pru);
    prussdrv_exit();
    return 0;
}
And here's the assembly:
// PRU-side DDR write benchmark.
//
// Reads the address and size of a DDR segment from the first two words of
// PRU shared RAM, then repeatedly fills that segment, storing into each
// 32-bit word its own address. After every full pass the pass count is
// written back to word 0 of shared RAM. After 10000 passes the program
// signals the host and halts.
.origin 0
.entrypoint TOP
// Register aliases: base address of the DDR segment, its size in bytes,
// and the base address of PRU shared RAM.
#define DDR r29
#define DDR_SIZE r28
#define SHARED_RAM r27
// Address loaded into SHARED_RAM; the two parameter words are read from here.
#define SHARED_RAM_ADDRESS 0x10000
TOP:
// Enable OCP master ports in SYSCFG register
// (read the word at C4 + 4, clear bit 4, write it back)
LBCO r0, C4, 4, 4
CLR r0, r0, 4
SBCO r0, C4, 4, 4
MOV SHARED_RAM, SHARED_RAM_ADDRESS
// From shared RAM, grab the address of the shared DDR segment
LBBO DDR, SHARED_RAM, 0, 4
// And the size of the segment from SHARED_RAM + 4
LBBO DDR_SIZE, SHARED_RAM, 4, 4
// BIGLOOP is one pass overwriting the shared DDR memory segment
// r12 counts completed passes; r14 holds the number of passes to run.
mov r12, 0
mov r14, 10000
BIGLOOP:
// Start at the beginning of the segment
// r10 is the write cursor; r11 is one past the end of the segment.
MOV r10, DDR
ADD r11, DDR, DDR_SIZE
// Tight loop writing the physical address of each word into that word
LOOP0:
// Store 4 bytes starting at register r10 (the cursor value itself) to the
// address held in r10, then advance the cursor by one word.
SBBO r10, r10, 0, 4
ADD r10, r10, 4
// Operand order note: "QBLT label, a, b" branches when b < a, so this
// loops back while r10 < r11 (the reverse of reading it left to right).
QBLT LOOP0, r11, r10
// One full pass done: bump the pass counter and publish it in word 0 of
// shared RAM. This overwrites the DDR address the host stored there, which
// was already copied into the DDR register above.
ADD r12, r12, 1
SBBO r12, SHARED_RAM, 0, 4
// Same operand order as QBLT: run another pass while r14 > r12.
QBGT BIGLOOP, r12, r14
// Interrupt the host so it knows we're done
MOV r31.b0, 19 + 16
// Don't forget to halt!
HALT
Here's the output I get, about 200MB/sec:
262144 bytes of shared DDR available.
Physical (PRU-side) address:9e6c0000
Virtual (linux-side) address: 0xb6d78000
DDR[0] is: 0xb6d78000 / 0x9e6c0000
Bytes written: 200540160
DDR[1] is: 0xb6d78004 / 0x9e6c0004
Bytes written: 401342464
DDR[2] is: 0xb6d78008 / 0x9e6c0008
Bytes written: 601882624
DDR[3] is: 0xb6d7800c / 0x9e6c000c
Bytes written: 802160640
DDR[4] is: 0xb6d78010 / 0x9e6c0010
Bytes written: 1002176512
DDR[5] is: 0xb6d78014 / 0x9e6c0014
Bytes written: 1202454528
DDR[6] is: 0xb6d78018 / 0x9e6c0018
Bytes written: 1402470400
DDR[7] is: 0xb6d7801c / 0x9e6c001c
Bytes written: 1602748416
DDR[8] is: 0xb6d78020 / 0x9e6c0020
Bytes written: 1802764288
DDR[9] is: 0xb6d78024 / 0x9e6c0024
Bytes written: 2003042304
All done
If I increase the number of bytes written per SBBO from 4 to 8 (changing both the SBBO length and the ADD increment after LOOP0), I believe each store writes the contents of both r10 and r11 into memory, and throughput rises to about 320MB/sec. With 16 bytes per write, I get about 450MB/sec.
So the PRU really can write very quickly to system RAM.