memcpy bandwidth

Problem 
Write a simple IP to measure the memcpy bandwidth
Solution
The solution consists of three steps: design a simple IP using Vivado HLS, prepare the embedded system using Vivado, and run a simple program using the SDK.
1- (design a simple IP) The following C++ code in Vivado HLS reads data from the main memory using the memcpy function and writes the data back to the main memory by the same mechanism.

memcpy_burst

#include <stdio.h>
#include <string.h>

/* Number of floats moved per invocation.
 * Parenthesized so that N behaves as a single value inside any larger
 * expression (e.g. `x / N`). */
#define N (1024*128)

/**
 * Copy N floats from one location of the memory reached through `a` to
 * another, staging them in a local buffer.
 *
 * @param a             Base pointer of the memory port (ap_bus interface,
 *                      mapped to the AXI4M core by the pragmas below).
 * @param byte_rdoffset Byte offset, relative to `a`, of the first float to
 *                      read. It is converted to a float index by integer
 *                      division, so it must be a multiple of sizeof(float).
 * @param byte_wroffset Byte offset, relative to `a`, of the first float to
 *                      write; same alignment requirement as byte_rdoffset.
 *
 * Both offsets and the block-level return/control signals are exposed on the
 * AXI4LiteS core in the "LITE" bundle.
 *
 * NOTE(review): the two memcpy calls are presumably what lets the HLS tool
 * generate burst transfers on the master port -- confirm in the synthesis
 * report.
 */
void memcpy_burst(volatile float *a, volatile unsigned int byte_rdoffset, volatile unsigned int byte_wroffset){

#pragma HLS INTERFACE ap_bus port=a
#pragma HLS RESOURCE core=AXI4M variable=a

#pragma HLS RESOURCE core=AXI4LiteS variable=return metadata="-bus_bundle LITE"

#pragma HLS INTERFACE ap_none register     port=byte_rdoffset
#pragma HLS RESOURCE core=AXI4LiteS    variable=byte_rdoffset metadata="-bus_bundle LITE"

#pragma HLS INTERFACE ap_none register     port=byte_wroffset
#pragma HLS RESOURCE core=AXI4LiteS    variable=byte_wroffset metadata="-bus_bundle LITE"

  /* Staging buffer. Deliberately NOT volatile: memcpy accesses it through
   * non-volatile pointers, which is undefined behavior for an object that is
   * itself defined volatile. */
  float buff[N];

  /* Read phase: memory -> local buffer. */
  memcpy(buff,(const float *)(a+byte_rdoffset/sizeof(float)),N*sizeof(float));
  /* Write phase: local buffer -> memory. */
  memcpy((float *)(a+byte_wroffset/sizeof(float)),buff,N*sizeof(float));
}

2- (prepare the embedded system) If the generated IP is connected to the Zynq ACP slave port in Vivado, then we have the following design:
3- (SDK software) The following C code measures the memcpy throughput using the above IP:

memcpy throughput sw

#include <stdio.h>

#include "xmemcpy_burst.h"
#include <xscutimer.h>

/* Start value loaded into the SCU timer before the measured run. */
#define TIMER_LOAD_VALUE      0xFFFFFFFF
/* Number of floats in each test buffer (same value as N in the HLS kernel).
 * Parenthesized so that N behaves as a single value inside any larger
 * expression (e.g. `x / N`). */
#define N (1024*128)

/* NOTE(review): declared here but neither defined nor called in the code
 * shown; presumably provided by another file -- confirm or remove. */
u32 frequency_set(int pl_clock, int divisor0, int divisor1) ;

/* Driver instance for the SCU private timer used to time the transfer. */
XScuTimer Timer;
/* Driver instance for the memcpy_burst IP. */
XMemcpy_burst memcpy_burst;
/* NOTE(review): not referenced by the code shown. */
XMemcpy_burst_Config config;

int main()
{
int status;
XMemcpy_burst_Config *CfgPtr;

//timer initialization
XScuTimer_Config *TMRConfigPtr;
TMRConfigPtr = XScuTimer_LookupConfig(XPAR_SCUGIC_0_DEVICE_ID);
XScuTimer_CfgInitialize(&Timer, TMRConfigPtr,TMRConfigPtr->BaseAddr);
XScuTimer_SelfTest(&Timer);

    print("Hello memcpy\n\r");

    CfgPtr = XMemcpy_burst_LookupConfig(XPAR_MEMCPY_BURST_0_DEVICE_ID);
    if(!CfgPtr){
    print("Error looking for AXI DMA config\n\r");
    return XST_FAILURE;
    }

    status = XMemcpy_burst_CfgInitialize(&memcpy_burst,CfgPtr);
    if(status != XST_SUCCESS){
    print("Error initializing DMA\n\r");
    return XST_FAILURE;
    }

float srcData[N];
float dstData[N];

XMemcpy_burst_SetByte_rdoffset(&memcpy_burst, &srcData);
XMemcpy_burst_SetByte_wroffset(&memcpy_burst, &dstData);

    //load the timer
    XScuTimer_LoadTimer(&Timer, TIMER_LOAD_VALUE);
    XScuTimer_Start(&Timer);

    XMemcpy_burst_Start(&memcpy_burst);

    while (!XMemcpy_burst_IsIdle(&memcpy_burst)) ;

    u32 value = XScuTimer_GetCounterValue(&Timer);
    printf("Timer value = %f\n\r", 0.002*(0xFFFFFFFF-value)/666.66);

    print("Bye memcpy\n\r");
    return 0;
}

Results:
If the PL frequency is 50 MHz, the memcpy throughput is about 198 MByte/sec,
and
if the PL frequency is 100 MHz, the memcpy throughput is about 324 MByte/sec.