Solution: Intel Atom
Solution: Reboot until it works (NVIDIA driver is closed source)
Solution:
# Hot-remove the PCI device at 0000:03:00.0, then ask the kernel to rescan
# the bus so the device is enumerated again. These must be two separate
# commands: on a single line the shell runs one `echo` with two
# redirections and nothing is written to the `remove` node.
echo "1" > /sys/bus/pci/devices/0000\:03\:00.0/remove
echo "1" > /sys/bus/pci/rescan
Solution: Always copy the data back to the host first (cudaMemcpy with "cudaMemcpyDeviceToHost") and inspect the host copy.
#!/bin/bash
# Post-boot sanity check for the GPUDirect test machine.
# Runs four independent checks and prints one "ERROR: ..." line on stdout
# for each check that fails. Prints nothing when everything looks fine.

# 1. Kernel log must not contain an "Oops" (reported as an nvidia driver crash).
OOPS=$(dmesg | grep "Oops")
if [ -n "$OOPS" ]; then
    echo "ERROR: nvidia driver crashed during boot."
fi

# 2. `lspci -k` must list mlx4_core as a kernel driver/module.
MLX=$(lspci -k | grep "mlx4_core")
if [ -z "$MLX" ]; then
    echo "ERROR: mlx4_core is not assigned to ib card."
fi

# 3. The expected IP address must be configured on some interface.
#    -F matches the address literally; as a regex, each '.' would match
#    any character.
IFCONFIG=$(ifconfig | grep -F "192.168.11.72")
if [ -z "$IFCONFIG" ]; then
    echo "ERROR: IP is not set."
fi

# 4. `ibhosts` must list a node whose description contains "gpudirect".
IBHOSTS=$(ibhosts | grep "gpudirect")
if [ -z "$IBHOSTS" ]; then
    echo "ERROR: Infiniband hostname not set."
fi
// From nv-p2p.h
/*
* @brief
* Make the pages underlying a range of GPU virtual memory
* accessible to a third-party device.
*
* NOTE(review): the parameter notes below are inferred from the signature
* and from NVIDIA's public GPUDirect RDMA documentation; confirm against
* the nv-p2p.h shipped with the driver in use.
*
* p2p_token, va_space - presumably identify the P2P mapping / GPU address
*                       space the range belongs to.
* virtual_address     - start of the GPU virtual address range.
* length              - size of the range in bytes (presumably; alignment
*                       requirements are not visible in this excerpt).
* page_table          - double pointer; looks like an output that receives
*                       the page table describing the pinned pages.
* free_callback       - function taking a single void * argument;
*                       presumably invoked when the pages are released
*                       underneath the caller.
* data                - opaque pointer, presumably handed back to
*                       free_callback.
*
* Returns an int status code (meaning of values not shown in this excerpt).
*
...
*/
int nvidia_p2p_get_pages(uint64_t p2p_token, uint32_t va_space,
uint64_t virtual_address,
uint64_t length,
struct nvidia_p2p_page_table **page_table,
void (*free_callback)(void *data),
void *data);
// From nv_peer_mem.c
/*
 * Initialisation routine (marked __init) for the peer-memory client.
 *
 * Copies DRV_NAME and DRV_VERSION into the nv_mem_client descriptor, then
 * registers that descriptor together with mem_invalidate_callback through
 * ib_register_peer_memory_client(). The returned handle is kept in
 * reg_handle.
 *
 * Returns 0 when a non-NULL handle was obtained, -EINVAL otherwise.
 */
static int __init nv_mem_client_init(void)
{
	strcpy(nv_mem_client.name, DRV_NAME);
	strcpy(nv_mem_client.version, DRV_VERSION);

	reg_handle = ib_register_peer_memory_client(&nv_mem_client,
						    &mem_invalidate_callback);

	/* A NULL handle means registration failed. */
	return reg_handle ? 0 : -EINVAL;
}
// From verbs.h
/**
* ibv_reg_mr - Register a memory region
*
* NOTE(review): the notes below follow the libibverbs man page, not this
* excerpt; confirm against the verbs.h version in use.
*
* pd     - protection domain the memory region is associated with.
* addr   - start address of the buffer to register.
* length - size of the buffer in bytes.
* access - bitmask of access flags (presumably IBV_ACCESS_* values).
*
* Returns a pointer to the registered struct ibv_mr; presumably NULL on
* failure.
*/
struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr,
size_t length, int access);
// From kiro-rdma.h
#ifdef GPUDIRECT
void *mem_handle = mem;
int error;
if (!mem_handle) {
error = cudaMalloc (&mem_handle, mem_size);
...
*mr = ibv_reg_mr (pd, mem_handle, mem_size, access);