Why GPU code does not speed up CPU ?

7 visualizaciones (últimos 30 días)
Marcin
Marcin el 6 de Feb. de 2014
Respondida: Yair Altman el 16 de Mzo. de 2014
Dear All,
I've tried to speed up calculations with Parallel Computing Toolbox. Program calculate Lyapounov exponent in 2d disorder systems. CPU code was simply changed to gpu version (all with gpuArrays). Unfourtuanetlly I can't see any speed up - even worse - code is significantly slover. Below I present two versions of code: CPU and GPU. I would like to ask for help.
%CPU
global W_E_LAMBDA;
n0 = 10;
n_save = 100;
epsilon = 4.0e-2;
M_vec = [25:25:125]';
W_vec = [15; 14; 13; 12; 11; 10; 9; 8; 7; 6.5; 6; 5.5; 5; 4.5;];
E_vec = [0];
W_E_LAMBDA = cell(length(W_vec),length(E_vec));
tic;
for W_i = 1:length(W_vec)
W = W_vec(W_i);
for E_i=1:length(E_vec);
E = E_vec(E_i);
file_name_L = ['L_W.' num2str(W) '_E.' num2str(E) '.txt'];
LAMBDA = [];
for M_i = 1:length(M_vec)
M = M_vec(M_i);
l_m_n_vec = [];
file_name_l = ['l_W.' num2str(W) '_E.' num2str(E) '_M.' num2str(M) '.txt'];
file_l_ID = fopen(file_name_l,'w');
H_n_0 = (diag(ones(1,M-1),-1) + diag(ones(1,M-1),1));
H_n_0(1,M) = 1;
H_n_0(M,1) = 1;
eye_M = eye(M);
B = eye_M;
An = eye_M;
An_m1 = zeros(M);
An_p1 = zeros(M);
b_n = sqrt(M);
c_n = 0;
d_n = 0;
l_m_n = 0;
ratio = 1;
ratio_MK = 1;
n = 0;
n_slice = 0;
disorder_no = 1;
disorder = -W/2 + W*rand(M);
while(ratio>=epsilon)
n = n + 1;
n_slice = n_slice + 1;
disorder_slice = diag(disorder(:,n_slice));
An_p1 = (E*eye_M - (H_n_0 - disorder_slice ) )*An - An_m1;
if(mod(n,n0) == 0)
inv_An = An^(-1);
An_p1 = An_p1*inv_An;
An = eye_M;
B = B*inv_An/b_n;
b_n = sqrt(sum( sum( abs(B*An_p1^(-1)).^2 ) ));
c_n = c_n + log(b_n);
d_n = d_n + (log(b_n))^2;
l_m_n = -n/c_n;
l_m_n_vec = [l_m_n_vec; -n/c_n];
delta_n = sqrt(d_n/n - (c_n/n)^2);
end
An_m1 = An;
An = An_p1;
if(mod(n,size(disorder,2) == 0))
n_slice = 0;
disorder_no = disorder_no + 1;
disorder = -W/2 + W*rand(M);
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
if(mod(n,n_save) == 0)
ratio_MK = sqrt(n*d_n/c_n^2 - 1);
zakres = floor(length(l_m_n_vec)*0.05+1):length(l_m_n_vec);
fprintf(file_l_ID,'%d %f \n',n,l_m_n_vec(end));
ratio = abs(std(l_m_n_vec(zakres),1)/mean(l_m_n_vec(zakres)));
fprintf('----------------------------------------------\n');
fprintf('E_i = %d / %d \n',E_i, length(E_vec))
fprintf('V_0_i = %d / %d \n',W_i, length(W_vec))
fprintf('M = %d / %d \n',M_i,length(M_vec));
fprintf('disorder_no = %d\n',disorder_no);
fprintf('eps = %f\n',ratio);
fprintf('eps_MK = %f\n', ratio_MK );
fprintf('----------------------------------------------\n');
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
LAMBDA = [LAMBDA; mean(l_m_n_vec(zakres))/M];
fclose(file_l_ID);
end
W_E_LAMBDA{W_i,E_i} = LAMBDA;
M_LAMBDA = [M_vec LAMBDA];
save(file_name_L,'M_LAMBDA','-ascii');
end
end
matlabpool close;
toc;
%GPU
global W_E_LAMBDA;
n0 = 10;
n_save = 100;
epsilon = 4.0e-2;
M_vec = [25:25:125]';
W_vec = [15; 14; 13; 12; 11; 10; 9; 8; 7; 6.5; 6; 5.5; 5; 4.5;];
E_vec = [0];
W_E_LAMBDA = cell(length(W_vec),length(E_vec));
global W_E_LAMBDA;
W_E_LAMBDA = cell(length(W_vec),length(E_vec));
tic;
for W_i = 1:length(W_vec)
for E_i=1:length(E_vec);
W = W_vec(W_i);
E = E_vec(E_i);
LAMBDA = gpuArray([]);
file_name_L = ['L_W.' num2str(W) '_E.' num2str(E) '.txt'];
for M_i = 1:length(M_vec)
M = M_vec(M_i);
l_m_n_vec = [];
n = 0;
b_n = sqrt(M);
c_n = 0;
d_n = 0;
l_m_n = 0;
ratio = 1;
ratio_MK = 1;
n_slice = 0;
disorder_no = 1;
disorder = -W/2 + W*rand(M);
H_n_0 = (diag(ones(1,M-1),-1) + diag(ones(1,M-1),1));
H_n_0(1,M) = 1;
H_n_0(M,1) = 1;
eye_M = eye(M);
B = eye_M;
An = eye_M;
An_m1 = zeros(M);
An_p1 = zeros(M);
file_name_l = ['l_W.' num2str(W) '_E.' num2str(E) '_M.' num2str(M) '.txt'];
file_l_ID = fopen(file_name_l,'w');
%sending to GPU
W = gpuArray(W);
E = gpuArray(E);
M = gpuArray(M);
l_m_n_vec = gpuArray(l_m_n_vec);
n = gpuArray(n);
b_n = gpuArray(b_n);
c_n = gpuArray(c_n);
d_n = gpuArray(d_n);
l_m_n = gpuArray(l_m_n);
epsilon = gpuArray(epsilon);
ratio = gpuArray(ratio);
ratio_MK = gpuArray(ratio_MK);
n_slice = gpuArray(n_slice);
disorder_no = gpuArray(disorder_no);
disorder = gpuArray(disorder);
H_n_0 = gpuArray(H_n_0);
B = gpuArray(B);
An = gpuArray(An);
An_m1 = gpuArray(An_m1);
An_p1 = gpuArray(An_p1);
while(ratio>=epsilon)
n = n + 1;
n_slice = n_slice + 1;
disorder_slice = diag(disorder(:,n_slice));
An_p1 = (E*eye_M - (H_n_0 - disorder_slice ) )*An - An_m1;
if(mod(n,n0) == 0)
inv_An = An^(-1);
An_p1 = An_p1*inv_An;
An = gpuArray.eye(M);
B = B*inv_An/b_n;
b_n = sqrt(sum( sum( abs(B*An_p1^(-1)).^2 ) ));
c_n = c_n + log(b_n);
d_n = d_n + (log(b_n))^2;
l_m_n = -n/c_n;
l_m_n_vec = [l_m_n_vec; -n/c_n];
delta_n = sqrt(d_n/n - (c_n/n)^2);
end
An_m1 = An;
An = An_p1;
if(mod(n,size(disorder,2) == 0))
n_slice = 0;
disorder_no = disorder_no + 1;
disorder = -W/2 + W*gpuArray.rand(M);
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
if(mod(n,n_save) == 0)
ratio_MK = sqrt(n*d_n/c_n^2 - 1);
zakres = floor(length(l_m_n_vec)*0.05+1):length(l_m_n_vec);
fprintf(file_l_ID,'%d %f \n',n,gather(l_m_n_vec(end)));
ratio = abs(std(l_m_n_vec(zakres),1)/mean(l_m_n_vec(zakres)));
fprintf('----------------------------------------------\n');
fprintf('E_i = %d / %d \n',E_i, length(E_vec))
fprintf('V_0_i = %d / %d \n',W_i, length(W_vec))
fprintf('M = %d / %d \n',M_i,length(M_vec));
fprintf('disorder_no = %d\n',disorder_no);
fprintf('eps = %f\n',ratio);
fprintf('eps_MK = %f\n', ratio_MK );
fprintf('----------------------------------------------\n');
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
LAMBDA = [LAMBDA; mean(l_m_n_vec(zakres))/M];
fclose(file_l_ID);
end
W_E_LAMBDA{W_i,E_i} = gather(LAMBDA);
M_LAMBDA = [M_vec LAMBDA];
save(file_name_L,'M_LAMBDA','-ascii');
end
end
toc;
As far I know I've avoided gpu-cpu transfer of data as much as possible, everything is going on GPU, only results are gathered. Where I made mistake ?
Best regards Marcin

Respuestas (1)

Yair Altman
Yair Altman el 16 de Mzo. de 2014
While data transfer is indeed a well-known GPU bottleneck, it is not the only limiting factor. Conditionals/branching (which you use in your code) also hinder GPU performance. Similarly, if the problem is not massively parallel you are not expected to see much (if any) speedup.

Categorías

Más información sobre Deep Learning Toolbox en Help Center y File Exchange.

Etiquetas

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!

Translated by