意外に遅い? RTX4090 の fp64 性能

二次元 cavity 問題の MAC 法による数値解法(3) – ソフトウェア技術者(ときどき科学)のつぶやき (minosys.com)の記事では CPU を使って計算を実行しました。当然、GPU を使うとどうなるのか?という疑問が湧きます。そこで、手持ちの RTX 4090 の fp64 モードを使って計算してみました。

RTX4090 は fp32 : fp64 = 8 : 1 ですが、CUDA 数が大きいこととベースクロックが速いことから fp64 でも 1TFLOPS 超えを実現しているということになっています。それなりの値を期待したいところですが、格子数30程度では CPU(Intel 13700KF) に軍配が上がることになりました。(経過時間は秒単位。計測には GPU ライブラリの初期化時間は含まない。)

$python3 cavity2d.py
CPU: 3.0593001

$python3 cavity2d_cuda.py
GPU: 49.073562

なんと10倍以上も GPU が遅いという結果となりました。使っている PyTorch ライブラリが遅い可能性や、CUDA 向けにプログラムが最適化されていないという課題はありますが、ちょっと遅すぎません?格子数が上がると差が縮まってきたりするのでしょうか??

以下に PyTorch 版の cavity2d_cuda.py を記載します。

#! -*- coding: utf-8 -*-

import datetime
import sys
import numpy as np
import matplotlib.pyplot as plt
import torch

# 二次元 cavity 問題の解法
class cavity2d:
    def __init__(self, dt, nlattice, Re, u0):
        self.dt = dt
        self.nlattice = nlattice
        self.d = 1 / nlattice
        self.Re = 100
        self.u0 = u0
        self.t = 0.0
        self.niter = 2000
        self.maxerr = 0.01

        # 物理量
        self.p = torch.zeros((nlattice + 2, nlattice + 2), dtype=torch.float64).cuda()
        self.u = torch.zeros((nlattice + 2, nlattice + 3), dtype=torch.float64).cuda()
        self.v = torch.zeros((nlattice + 3, nlattice + 2), dtype=torch.float64).cuda()

        # 補助量
        self.large_d = torch.zeros((nlattice, nlattice), dtype=torch.float64).cuda()
        self.p_rhs = torch.zeros((nlattice, nlattice), dtype=torch.float64).cuda()
        self.p_lhs = torch.zeros((nlattice, nlattice), dtype=torch.float64).cuda()
        self.u_rhs = torch.zeros((nlattice - 1, nlattice - 1), dtype=torch.float64).cuda()
        self.v_rhs = torch.zeros((nlattice - 1, nlattice - 1), dtype=torch.float64).cuda()
        self.ua = torch.zeros((nlattice - 1, nlattice - 1), dtype=torch.float64).cuda()
        self.ub = torch.zeros((nlattice, nlattice), dtype=torch.float64).cuda()
        self.uc = torch.zeros((nlattice, nlattice), dtype=torch.float64).cuda()
        self.va = torch.zeros((nlattice - 1, nlattice - 1), dtype=torch.float64).cuda()
        self.vb = torch.zeros((nlattice, nlattice), dtype=torch.float64).cuda()
        self.vc = torch.zeros((nlattice, nlattice), dtype=torch.float64).cuda()

    # 平均量の計算
    def average(self, z, n, c):
        return (z[2+c[0]:n+c[0], 2+c[1]:n+c[1]] \
               +z[3+c[0]:n+1+c[0], 2+c[1]:n+c[1]] \
               +z[2+c[0]:n+c[0], 3+c[1]:n+1+c[1]] \
               +z[3+c[0]:n+1+c[0], 3+c[1]:n+1+c[1]]) / 4.0

    # Laplace 演算子
    def laplace(self, z, x_bd, y_bd):
        return z[x_bd[0]+1:x_bd[1]+1,y_bd[0]:y_bd[1]] +z[x_bd[0]-1:x_bd[1]-1,y_bd[0]:y_bd[1]] +z[x_bd[0]:x_bd[1],y_bd[0]-1:y_bd[1]-1] +z[x_bd[0]:x_bd[1],y_bd[0]+1:y_bd[1]+1]

    # 圧力の境界条件
    def boundary_p(self):
        # 辺 CD
        self.p[self.nlattice+1, :] = self.p[self.nlattice, :] - (self.v[self.nlattice - 2, :] - 2.0 * self.v[self.nlattice - 1, :]) / self.d / self.Re
        # 辺 DA
        self.p[:, 0] = self.p[:, 1] + (self.u[:, 3] - 2.0 * self.u[:, 2]) / self.d / self.Re
        # 辺 AB
        self.p[0, :] = self.p[1, :] + (self.v[3, :] - 2.0 * self.v[2, :]) / self.d / self.Re
        # 辺 BC
        self.p[:, self.nlattice+1] = self.p[:, self.nlattice] - (self.u[:, self.nlattice - 2] - 2.0 * self.u[:, self.nlattice - 1]) / self.d / self.Re


    # u, v の境界条件
    def boundary_uv(self):
        # 辺 CD
        self.v[self.nlattice+2, :] = self.v[self.nlattice, :]
        self.v[self.nlattice+1, :] = 0
        self.u[self.nlattice+1, :] = self.u0 / 4.0 - self.u[self.nlattice, :]
        # 辺 DA
        self.u[:, 0] = self.u[:, 2]
        self.u[:, 1] = 0
        self.v[:, 0] = -self.v[:, 1]

        # 辺 AB
        self.v[0, :] = self.v[2, :]
        self.v[1, :] = 0
        self.u[0, :] = -self.u[1, :]

        # 辺 BC
        self.u[:, self.nlattice+2] = self.u[:, self.nlattice]
        self.u[:, self.nlattice+1] = 0
        self.v[:, self.nlattice+1] = -self.v[:, self.nlattice]

    # 連続の式
    def cont_eq(self):
        self.large_d[:,:] = (self.v[2:self.nlattice+2,1:self.nlattice+1] - self.v[1:self.nlattice+1,1:self.nlattice+1] \
            + self.u[1:self.nlattice+1,2:self.nlattice+2] - self.u[1:self.nlattice+1,1:self.nlattice+1]) / self.d

    # ua, ub, va, vb の計算
    def calc_uv(self):
        self.ua[:,:] = self.average(self.u, self.nlattice + 1, (0, 0))
        self.ub[:,:] = self.average(self.u, self.nlattice + 2, (-1, -1))
        self.uc[:,:] = self.average(self.u, self.nlattice + 2, (-2, -1))
        self.va[:,:] = self.average(self.v, self.nlattice + 1, (0, 0))
        self.vb[:,:] = self.average(self.v, self.nlattice + 2, (-1, -1))
        self.vc[:,:] = self.average(self.v, self.nlattice + 2, (-1, -2))

    # 圧力右辺の計算 (poisson 項除く)
    def calc_p_rhs(self):
        self.p_rhs[:,:] = -self.d ** 2 / self.dt * self.large_d \
            +(self.u[1:self.nlattice+1, 2:self.nlattice+2] - self.u[1:self.nlattice+1, 1:self.nlattice+1]) ** 2 \
            +(self.v[2:self.nlattice+2, 1:self.nlattice+1] - self.v[1:self.nlattice+1, 1:self.nlattice+1]) ** 2 \
            +2.0 * (self.ub[:,:] - self.uc[:,:]) * (self.vb[:,:] - self.vc[:,:])

    # 圧力の poisson 方程式を Gauss 法で解く
    def solve_p(self):
        self.boundary_uv()
        self.boundary_p()
        self.calc_uv()
        self.calc_p_rhs()
        err = self.maxerr
        for _ in range(self.niter):
            self.p_lhs[:,:] = (self.laplace(self.p, (1, self.nlattice+1), (1, self.nlattice+1)) + self.p_rhs[:,:]) / 4.0
            err = torch.max(torch.abs(self.p[1:self.nlattice+1,1:self.nlattice+1] - self.p_lhs[:,:]))
            self.p[1:self.nlattice+1,1:self.nlattice+1] = self.p_lhs[:,:]
            if err < self.maxerr:
                break
        if err >= self.maxerr:
            print("{:.4f}: err not converged:".format(self.t), err)

    # u,v を時間発展させる
    def evolve_uv(self):
        self.u_rhs[:,:] = self.u[2:self.nlattice+1,2:self.nlattice+1] * (self.u[2:self.nlattice+1, 3:self.nlattice+2] - self.u[2:self.nlattice+1, 1:self.nlattice+0]) / 2.0 / self.d \
          + self.va[:,:] * (self.v[3:self.nlattice+2, 2:self.nlattice+1] - self.v[1:self.nlattice+0, 2:self.nlattice+1]) / 2.0 / self.d \
          + (self.p[2:self.nlattice+1, 3:self.nlattice+2] - self.p[2:self.nlattice+1,2:self.nlattice+1]) / self.d \
          - (self.laplace(self.u, (2, self.nlattice+1), (2, self.nlattice+1)) - 4.0 * self.u[2:self.nlattice+1,2:self.nlattice+1]) / (self.d ** 2) / self.Re
        self.u_rhs[:,:] = self.u[2:self.nlattice+1,2:self.nlattice+1] - self.u_rhs[:,:] * self.dt

        self.v_rhs[:,:] = self.v[2:self.nlattice+1,2:self.nlattice+1] * (self.v[3:self.nlattice+2,2:self.nlattice+1] - self.v[1:self.nlattice+0,2:self.nlattice+1]) / 2.0 / self.d \
          + self.ua[:,:] * (self.u[2:self.nlattice+1,3:self.nlattice+2] - self.u[2:self.nlattice+1,1:self.nlattice+0]) / 2.0 / self.d \
          + (self.p[3:self.nlattice+2, 2:self.nlattice+1] - self.p[2:self.nlattice+1,2:self.nlattice+1]) / self.d \
          - (self.laplace(self.v, (2, self.nlattice+1), (2, self.nlattice+1)) - 4.0 * self.v[2:self.nlattice+1,2:self.nlattice+1]) / (self.d ** 2) / self.Re
        self.v_rhs[:,:] = self.v[2:self.nlattice+1,2:self.nlattice+1] - self.v_rhs[:,:] * self.dt

        # 計算結果を代入
        self.u[2:self.nlattice+1, 2:self.nlattice+1] = self.u_rhs[:,:]
        self.t += self.dt

if __name__ == '__main__':
    nlattice = 30
    ntime = 35000
    if len(sys.argv) > 1:
        nlattice = int(sys.argv[1])
    if len(sys.argv) > 2:
        ntime = int(sys.argv[2])
    c2d = cavity2d(dt = 0.001, nlattice=nlattice, Re = 100, u0 = 1.0)
    st = datetime.datetime.now()
    print("start calculation")
    for _ in range(ntime):
        c2d.solve_p()
        c2d.evolve_uv()
    ed = datetime.datetime.now()
    print("end calculation")
    d = ed - st
    print(d.seconds + (d.microseconds / 10000000.))
    x, y = np.meshgrid(np.arange(0, nlattice + 1), np.arange(0, nlattice + 1))
    u = c2d.u[1:nlattice+2,1:nlattice+2].cpu() * 15.0
    v = c2d.v[1:nlattice+2,1:nlattice+2].cpu() * 15.0
    pmin = torch.min(torch.abs(c2d.p))
    pmax = torch.max(torch.abs(c2d.p) - pmin)
    pcol = (torch.abs(c2d.p) - pmin) / pmax
    print(pcol)
    plt.rc('font', size=22)
    plt.xlim(0, nlattice)
    plt.ylim(0, nlattice)
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.imshow(pcol.cpu().numpy(), origin='lower')
    ax.quiver(x, y, u, v, units='xy', scale=1, color='black')
    ax.grid()
    plt.title('fluid velocities')
    plt.savefig('cavity2d.png')
投稿者について
みのしす

小さいときは科学者になろうとしたのに、その時にたまたま身に着けたプログラミングで未だに飯を食っているしがないおじさんです。(年齢的にはもうすぐおじいさん)

1 throughts on "意外に遅い? RTX4090 の fp64 性能"

コメントを残す

メールアドレスが公開されることはありません。 が付いている欄は必須項目です