Scientific Computing, TU Berlin, WS 2019/2020, Lecture 28
Jürgen Fuhrmann, WIAS Berlin
https://juliagpu.gitlab.io/CUDA.jl/tutorials/introduction/
Necessary packages
using CUDAdrv, CUDAnative, CuArrays
using LinearAlgebra
using BenchmarkTools
Simple function: add vector on CPU
function sequential_add!(y, x)
for i in eachindex(y, x)
@inbounds y[i] += x[i]
end
return nothing
end
function parallel_add!(y, x)
Threads.@threads for i in eachindex(y, x)
@inbounds y[i] += x[i]
end
return nothing
end
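Note that the multithreaded version only helps if Julia was started with more than one thread, e.g. by setting the JULIA_NUM_THREADS environment variable before launching Julia. A quick way to check:
# Number of threads available to Threads.@threads;
# start Julia with JULIA_NUM_THREADS=4 to get four of them.
Threads.nthreads()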
Create vectors on the CPU:
N=2_000_000
T=Float32
x=rand(T,N)
y=rand(T,N)
Create vectors on the GPU:
x_gpu = CuArray{T}(undef, N);
y_gpu = CuArray{T}(undef, N);
@btime begin
copyto!($x_gpu,$x);
copyto!($y_gpu,$y);
end
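As an alternative to allocating with undef and calling copyto!, the data can also be moved in a single step; a small sketch (the names x_gpu2, y_gpu2 are only for illustration): the CuArray constructor keeps the element type, while cu converts to Float32 by default.
# One-step host-to-device transfer
x_gpu2 = CuArray(x)   # CuArray{Float32}, since x already has eltype Float32
y_gpu2 = cu(y)        # cu would also convert Float64 input to Float32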
function gpu_add_bcast!(y,x)
CuArrays.@sync y .+= x
end
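To check that the broadcast kernel really computes the same result as the CPU version, one can copy the GPU data back to the host and compare. A small sketch with fresh data, so that CPU and GPU start from identical values:
using Test
xt=rand(T,N); yt=rand(T,N)
xt_gpu=CuArray(xt); yt_gpu=CuArray(yt)
sequential_add!(yt,xt)          # CPU reference
gpu_add_bcast!(yt_gpu,xt_gpu)   # GPU broadcast version
@test Array(yt_gpu) ≈ yt        # copy back to host and compare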
Compare timings
@btime sequential_add!($x,$y);
@btime parallel_add!($x,$y);
@btime gpu_add_bcast!($x_gpu, $y_gpu);
We know that on the CPU this operation is limited by memory bandwidth, so 4 threads do not give much of a speedup.
function gpu_add1!(y,x)
function _kernel!(y, x)
for i = 1:length(y)
@inbounds y[i] += x[i]
end
return nothing
end
CuArrays.@sync begin
@cuda _kernel!(y,x)
end
end
@btime gpu_add1!($x_gpu, $y_gpu)
Sequential processing with linear indexing in a single GPU thread is incredibly slow here.
A more thorough adaptation to the CUDA data model, distributing the loop over the threads of one block, provides better performance:
function gpu_add2!(y, x;nthreads=256)
function _kernel!(y, x)
index = threadIdx().x
stride = blockDim().x
for i = index:stride:length(y)
@inbounds y[i] += x[i]
end
return nothing
end
CuArrays.@sync begin
@cuda threads=nthreads _kernel!(y,x)
end
end
@btime gpu_add2!($x_gpu, $y_gpu)
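In this kernel each GPU thread handles a strided subset of the indices: thread 1 processes 1, 1+stride, 1+2*stride, ..., thread 2 processes 2, 2+stride, ... and so on, so that one block of threads covers the whole vector. The same index pattern, spelled out on the CPU for illustration (a sketch, not device code):
# indices handled by thread tid when one block of nthreads threads covers n elements
thread_indices(tid,nthreads,n)=tid:nthreads:n
collect(thread_indices(1,256,1000))   # [1, 257, 513, 769]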
Go even further and do proper blocking, i.e. distribute the loop over several blocks of threads
(see the CUDA.jl tutorial linked above):
function gpu_add3!(y, x;nthreads=256)
function _kernel!(y, x)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = blockDim().x * gridDim().x
for i = index:stride:length(y)
@inbounds y[i] += x[i]
end
return
end
numblocks = ceil(Int, length(x)/nthreads)
CuArrays.@sync begin
@cuda threads=nthreads blocks=numblocks _kernel!(y,x)
end
end
@btime gpu_add3!($x_gpu, $y_gpu)
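Such vector additions are memory bound, so a useful sanity check is the effective memory bandwidth: each element requires reading x[i] and y[i] and writing y[i], i.e. 3*sizeof(T) bytes. A rough estimate (a sketch; the resulting number depends on the particular GPU):
t_kernel=@belapsed gpu_add3!($x_gpu,$y_gpu)    # minimum time per call in seconds
gb=3*N*sizeof(T)/1.0e9                         # data moved per call in GB
println("effective bandwidth: $(gb/t_kernel) GB/s")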
using ExtendableSparse
using SparseArrays
using Printf
using IterativeSolvers
Implement two Jacobi preconditioners based on just the diagonal vector of the matrix, by defining ldiv! methods for the CPU and the GPU case:
function LinearAlgebra.ldiv!(b,D::CuVector,a)
b.=a./D
end
function LinearAlgebra.ldiv!(b,D::Vector,a)
b.=a./D
end
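IterativeSolvers applies a left preconditioner Pl via the three-argument ldiv!(z, Pl, r), so passing the diagonal vector D as Pl makes the methods above perform the elementwise Jacobi step z = r ./ D. A minimal usage sketch with hypothetical data d and r:
d=[2.0, 4.0]; r=[1.0, 1.0]
z=similar(r)
ldiv!(z,d,r)     # uses the Vector method defined above: z .= r ./ d
# z == [0.5, 0.25]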
Create a random matrix and problem data for a 3D rectangular grid with 80^3 = 512000 unknowns (we could have done FE assembly here...)
n=80
N=n^3
t=Base.@elapsed begin
M=ExtendableSparse.fdrand(n,n,n,matrixtype=ExtendableSparseMatrix).cscmatrix
u_exact=rand(N)
D=Vector(diag(M))
f=M*u_exact
end
println("Creating matrix: $(t) s")
Copy the data onto the GPU ... yes, there are sparse matrices on the GPU as well!
t=Base.@elapsed begin
M_gpu=CuArrays.CUSPARSE.CuSparseMatrixCSC(M)
f_gpu=cu(f)
D_gpu=cu(D)
u_exact_gpu=cu(u_exact)
end
println("loading GPU: $(t) s")
Run the direct solver (the backslash operator uses a sparse LU factorization from UMFPACK here)
# first run for compiling
u=M\f
t=Base.@elapsed begin
u=M\f
end
println("Direct solution on CPU: $(t)s")
Use cg from IterativeSolvers.jl to solve the system with the Jacobi preconditioner:
# first run for compiling
u,hist=cg(M,f,Pl=D, tol=1.0e-10,log=true)
t=Base.@elapsed begin
u,hist=cg(M,f,Pl=D, tol=1.0e-10,log=true)
end
println("CG solution on CPU: $(t) s ($(hist.iters) iterations), error=$(norm(u_exact-u))")
Do the same on the GPU
... yes, it is the same cg
# first run for compiling
u_gpu,hist=cg(M_gpu,f_gpu,Pl=D_gpu,tol=Float32(1.0e-10),log=true)
t=Base.@elapsed begin
u_gpu,hist=cg(M_gpu,f_gpu,Pl=D_gpu,tol=Float32(1.0e-10),log=true)
end
println("CG solution on GPU: $(t) s ($(hist.iters) iterations), error=$(norm(u_exact_gpu-u_gpu))")
This notebook was generated using Literate.jl.