Hi,
Thanks for sharing this library. It defintively has a huge potential. I have an issue that I have experienced not only with metalcompute library but also in my own C-extensions of Metal through Swift for Python. It seems that the allocated buffers do not get released. I wonder if you have experienced this.
Below you can see a simplified code for the calculation of ultrasound fields using Metal. The function allocating and ( in principle) deallocating the buffers is called ForwardPropagationMetal
. You can see it does very simular tasks as in the examples: create buffers, copy from numpy, run kernel and recover results. In this code, I made in purpose to call this function N-times. If you let it running it, you will see in the activityMonitor how the memory continues to grow and eventually you will end with a CouldNotMakeBuffer
error. To ru it, just save it an run it with python <script>.py <substring of GPU> <Number of iterations>
, for example :
python DemoMetalcomputeRunOutMemory.py 'M1' 100000
.
This problems happens either in M1 or AMD GPUs. I tested this with my M1 Max Pro and with an external AMD PRO W6800 GPU via a thunderbolt3-connected enclosure with an iMac Pro. Both using latest Monterery and XCode versions.
The Swift code is supposed to release the buffers but I haven't found why this is not occurring. As mentioned above, I experience the same using if using custom made Swift functions that encapsulate creation of buffers and calls to metal functions.
As a side note, OpenCL is still supported in Monterey and for M1 processors (something that was not supposed to be supported anymore, but I'm not really complaining). I can run similar code using pyopencl with no memory issues. But in pyopencl, the library controls the deallocation of OpenCL buffers as any other Python object. So I wonder if Swift deallocator is having some sort of blockage and if there is a way to force the deallocation once the call to Metal compute has been completed.
Thanks for any hint you could provide,
Happy 2022!
Sam
import numpy as np
import matplotlib.pyplot as plt
import gc
import metalcompute as mc
RayleighMetalHeader="""
#include <metal_stdlib>
using namespace metal;
#define pi 3.141592653589793
typedef float FloatingType;
#define ppCos pCos
#define mr2 intparams[0]
#define mr1 intparams[1]
#define mr1step intparams[2]
#define c_wvnb_real c_wvnb[0]
#define c_wvnb_imag c_wvnb[1]
kernel void ForwardPropagationKernel( const device int * intparams [[buffer(0)]],
const device FloatingType * c_wvnb[[buffer(1)]],
const device FloatingType *r2pr [[buffer(2)]],
const device FloatingType *r1pr [[buffer(3)]],
const device FloatingType *a1pr [[buffer(4)]],
const device FloatingType *u1_real [[buffer(5)]],
const device FloatingType *u1_imag [[buffer(6)]],
device FloatingType *py_data_u2_real [[buffer(7)]],
device FloatingType *py_data_u2_imag [[buffer(8)]],
uint si2 [[thread_position_in_grid]])
{
FloatingType dx,dy,dz,R,r2x,r2y,r2z;
FloatingType temp_r,tr ;
FloatingType temp_i,ti,pCos,pSin ;
if ( si2 < mr2)
{
temp_r = 0;
temp_i = 0;
r2x=r2pr[si2*3];
r2y=r2pr[si2*3+1];
r2z=r2pr[si2*3+2];
for (int si1=0; si1<mr1; si1++)
{
// In matlab we have a Fortran convention, in Python-numpy, we have the C-convention for matrixes (hoorray!!!)
dx=r1pr[si1*3+mr1step*si2]-r2x;
dy=r1pr[si1*3+1+mr1step*si2]-r2y;
dz=r1pr[si1*3+2+mr1step*si2]-r2z;
R=sqrt(dx*dx+dy*dy+dz*dz);
ti=(exp(R*c_wvnb_imag)*a1pr[si1]/R);
tr=ti;
pSin=sincos(R*c_wvnb_real,ppCos);
tr*=(u1_real[si1]*pCos+u1_imag[si1]*pSin);
ti*=(u1_imag[si1]*pCos-u1_real[si1]*pSin);
temp_r +=tr;
temp_i +=ti;
}
R=temp_r;
temp_r = -temp_r*c_wvnb_imag-temp_i*c_wvnb_real;
temp_i = R*c_wvnb_real-temp_i*c_wvnb_imag;
py_data_u2_real[si2]=temp_r/(2*pi);
py_data_u2_imag[si2]=temp_i/(2*pi);
}
}
"""
metaldev = None
prgmetal = None
def SpeedofSoundWater(Temperature):
Xcoeff = [0.00000000314643 ,-0.000001478,0.000334199,-0.0580852,5.03711,1402.32]
speed = np.polyval(Xcoeff,Temperature)
return speed
def GenerateSurface(lstep,Diam,Foc):
Tx = {}
rInt=0
rExt=Diam/2
Beta1= np.arcsin(rInt/Foc)
Beta2= np.arcsin(rExt/Foc)
DBeta= Beta2-Beta1
ArcC = DBeta*Foc
nrstep = np.ceil(ArcC/lstep);
BetaStep = DBeta/nrstep;
BetaC = np.arange(Beta1+BetaStep/2,Beta1+BetaStep*(1/2 + nrstep),BetaStep)
Ind=0
SingElem = np.zeros((0,3))
N = np.zeros((0,3))
ds = np.zeros((0,1))
VertDisplay= np.zeros((0,3))
FaceDisplay= np.zeros((0,4),int)
for nr in range(len(BetaC)):
Perim = np.sin(BetaC[nr])*Foc*2*np.pi
nAlpha = np.ceil(Perim/lstep)
sAlpha = 2*np.pi/nAlpha
AlphaC = np.arange(sAlpha/2,sAlpha*(1/2 + nAlpha ),sAlpha)
SingElem=np.vstack((SingElem,np.zeros((len(AlphaC),3))))
N = np.vstack((N,np.zeros((len(AlphaC),3))))
ds = np.vstack((ds,np.zeros((len(AlphaC),1))))
VertDisplay= np.vstack((VertDisplay,np.zeros((len(AlphaC)*4,3))))
FaceDisplay= np.vstack((FaceDisplay,np.zeros((len(AlphaC),4),int)))
zc = -np.cos(BetaC[nr])*Foc
Rc = np.sin(BetaC[nr])*Foc
B1 = BetaC[nr]-BetaStep/2
B2 = BetaC[nr]+BetaStep/2
if nr==0:
Rc1=0
else:
Rc1 = np.sin(B1)*Foc
Rc2 = np.sin(B2)*Foc
zc1 =-np.cos(B1)*Foc
zc2 =-np.cos(B2)*Foc
SingElem[Ind:,0] = Rc*np.cos(AlphaC)
SingElem[Ind:,1] = Rc*np.sin(AlphaC)
SingElem[Ind:,2] = zc
A1 = AlphaC-sAlpha/2;
A2 = AlphaC+sAlpha/2;
ds[Ind:,0]=Foc**2 *(np.cos(B1) - np.cos(B2))*(A2-A1)
N[Ind:,:] =SingElem[Ind:,:]/np.repeat(np.linalg.norm(SingElem[Ind:,:],axis=1).reshape((len(AlphaC),1)),3,axis=1)
VertDisplay[Ind*4::4,0]= Rc1*np.cos(A1)
VertDisplay[Ind*4::4,1]= Rc1*np.sin(A1)
VertDisplay[Ind*4::4,2]= zc1
VertDisplay[Ind*4+1::4,0]= Rc1*np.cos(A2)
VertDisplay[Ind*4+1::4,1]= Rc1*np.sin(A2)
VertDisplay[Ind*4+1::4,2]= zc1
VertDisplay[Ind*4+2::4,0]= Rc2*np.cos(A1)
VertDisplay[Ind*4+2::4,1]= Rc2*np.sin(A1)
VertDisplay[Ind*4+2::4,2]= zc2
VertDisplay[Ind*4+3::4,0]= Rc2*np.cos(A2)
VertDisplay[Ind*4+3::4,1]= Rc2*np.sin(A2)
VertDisplay[Ind*4+3::4,2]= zc2
FaceDisplay[Ind:,0] =(Ind+np.arange(len(AlphaC)))*4
FaceDisplay[Ind:,1] =(Ind+np.arange(len(AlphaC)))*4+1
FaceDisplay[Ind:,2] =(Ind+np.arange(len(AlphaC)))*4+3
FaceDisplay[Ind:,3] =(Ind+np.arange(len(AlphaC)))*4+2
Ind+=len(AlphaC)
Tx['center'] = SingElem
Tx['ds'] = ds
Tx['normal'] = N
Tx['VertDisplay'] = VertDisplay
Tx['FaceDisplay'] = FaceDisplay
Tx['Beta1']=Beta1
Tx['Beta2']=Beta2
return Tx
def GenerateFocusTx(f,Foc,Diam,c,PPWSurface=4):
wavelength = c/f;
lstep = wavelength/PPWSurface;
Tx = GenerateSurface(lstep,Diam,Foc)
return Tx
def InitMetal(DeviceName='M1'):
global metaldev
global prgmetal
if metaldev is not None:
del metaldev
del prgmetal
metaldev = None
prgmetal = None
n=0
sel_n = -1
for dev in mc.get_devices():
print(dev.deviceName)
if DeviceName in dev.deviceName:
sel_n =n
break
n+=1
if sel_n ==-1:
raise SystemError("No Metal device containing name [%s]" %(DeviceName))
else:
metaldev = mc.Device(sel_n)
print('Selecting device: ', metaldev)
prgmetal = metaldev.kernel(RayleighMetalHeader).function("ForwardPropagationKernel")
def ForwardSimpleMetal(cwvnb,center,ds,u0,rf,u0step=0):
global metaldev
global prgmetal
if u0step!=0:
mr1=u0step
assert(mr1*rf.shape[0]==center.shape[0])
else:
mr1=center.shape[0]
d_r2pr = metaldev.buffer(rf)
d_r1pr = metaldev.buffer(center) # Create mc buffer as copy of b_np
d_u1realpr=metaldev.buffer(np.real(u0).copy())
d_u1imagpr=metaldev.buffer(np.imag(u0).copy())
d_a1pr = metaldev.buffer(ds)
d_u2realpr = metaldev.buffer(rf.shape[0]*4)
d_u2imagpr = metaldev.buffer(rf.shape[0]*4)
intparams = np.zeros(3,np.int32)
intparams[0]=rf.shape[0]
intparams[1]=mr1
intparams[2]=u0step
cwvnb_a=np.zeros(2,np.float32)
cwvnb_a[0]=np.real(cwvnb)
cwvnb_a[1]=np.imag(cwvnb)
d_intparams=metaldev.buffer(intparams)
d_cwvnb=metaldev.buffer(cwvnb_a)
handle = prgmetal(rf.shape[0],
d_intparams,
d_cwvnb,
d_r2pr,
d_r1pr,
d_a1pr,
d_u1realpr,
d_u1imagpr,
d_u2realpr,
d_u2imagpr)
del handle
u2_real=np.frombuffer(d_u2realpr,dtype='f')
u2_imag=np.frombuffer(d_u2imagpr,dtype='f')
u2=u2_real+1j*u2_imag
#tried to explicitly deference buffers and call garbage collector...
del d_intparams
del d_cwvnb
del d_r2pr
del d_r1pr
del d_a1pr
del d_u1realpr
del d_u1imagpr
del d_u2realpr
del d_u2imagpr
gc.collect()
return u2
def Main(args):
InitMetal(args.GPU)
bPlot=True
#some simple settings for the domain for the kernel simulation
xfmin=-8e-2
xfmax=8e-2
yfmin=-8e-2
yfmax=8e-2
zfmin=3e-2
zfmax=9e-2
extlay={}
TemperatureWater=20.0
extlay['c']=SpeedofSoundWater(TemperatureWater)
### we create a Tx
Freq=250e3
Diam=70e-3
Foc=70e-3
lstep=1500/Freq/4
SpatialStep=lstep
Tx = GenerateFocusTx(Freq,Foc,Diam,1500)
#dy default the focal point is at 0,0,0, we shift it to make the back of the Tx at z=0
Tx['center'][:,2]+=Foc
Tx['VertDisplay'][:,2]+=Foc
xfield = np.linspace(xfmin,xfmax,int(np.ceil((xfmax-xfmin)/SpatialStep)+1))
yfield = np.linspace(yfmin,yfmax,int(np.ceil((yfmax-yfmin)/SpatialStep)+1))
zfield = np.linspace(zfmin,zfmax,int(np.ceil((zfmax-zfmin)/SpatialStep)+1))
nxf=len(xfield)
nyf=len(yfield)
nzf=len(zfield)
xp,yp,zp=np.meshgrid(xfield,yfield,zfield,indexing='ij')
rf=np.hstack((np.reshape(xp,(nxf*nyf*nzf,1)),np.reshape(yp,(nxf*nyf*nzf,1)), np.reshape(zp,(nxf*nyf*nzf,1)))).astype(np.float32)
#focal point coordinates
cx=np.argmin(np.abs(xfield))
cy=np.argmin(np.abs(yfield))
cz=np.argmin(np.abs(zfield-Foc))
Att=0.0
cwvnb_extlay=np.array(2*np.pi*Freq/1500+(-1j*Att)).astype(np.complex64)
u0=np.ones((Tx['center'].shape[0],1),np.complex64)
for n in range(args.NIterations):
u2=ForwardSimpleMetal(cwvnb_extlay,Tx['center'].astype(np.float32),
Tx['ds'].astype(np.float32),u0,rf)
u2=np.reshape(u2,xp.shape)
u2=np.abs(u2)
if bPlot:
plt.figure(figsize=(16,5))
plt.subplot(1,4,1)
psel=20*np.log10(u2/np.max(u2))
plt.contourf(psel[:,cy,:].T,[-10,-6,-3,0],extent=(xfmin,xfmax,zfmin,zfmax),\
cmap=plt.cm.jet)
plt.colorbar()
plt.subplot(1,4,2)
plt.imshow(u2[:,cy,:].T/1e6,extent=(xfmin,xfmax,zfmin,zfmax),cmap=plt.cm.jet)
plt.colorbar()
plt.subplot(1,4,3)
plt.contourf(psel[cx,:,:].T,[-10,-6,-3,0],extent=(xfmin,xfmax,zfmin,zfmax),\
cmap=plt.cm.jet)
plt.colorbar()
plt.subplot(1,4,4)
plt.imshow(u2[cx,:,:].T/1e6,extent=(xfmin,xfmax,zfmin,zfmax),cmap=plt.cm.jet)
plt.colorbar()
plt.show()
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("GPU", help="select Metal GPU (just need partial substring, 'M1','6800', etc",default='M1')
parser.add_argument("NIterations", help="Number of iterations of metal kernel",type=int,default=1000)
args = parser.parse_args()
print(args)
Main(args)
bug