The gmfss from hyw-dev

Request: fp16 inference

This repository is amazing and I really like the results, but the VRAM usage is quite high when I try to use it with VapourSynth: a real video at 512x512 px needs around 15 GB of VRAM. The speed is not bad, but I struggle to reach any usable resolution. For some reason your test.py needs much less VRAM; when I use the model class directly, it needs a lot more memory.

When I use test.py with a loop, also at 512x512:

from tqdm import tqdm
# Repeat the same make_inference call 1000 times with a tqdm progress bar;
# `result` is overwritten on every iteration, so only the last one is kept.
for i in tqdm(range(1000)):
  result = make_inference(I0, I1, n_frames, scale, pred_bidir_flow)

class Model_inference(nn.Module):
    """Expose ``Model.inference`` through a plain ``forward(I0, I1)`` call.

    The interpolation settings are fixed here: one in-between frame, a flow
    scale of 1.0 and no bidirectional flow prediction.
    """

    def __init__(self):
        super().__init__()
        self.model = Model()

    def forward(self, I0, I1):
        """Return the first result of ``Model.inference``, cropped to ``I0``'s size.

        ``I0`` is unpacked as a 4-D ``(n, c, h, w)`` shape. ``I1`` receives the
        same padding, so it is presumably expected to have the same shape
        (this is not checked here).
        """
        frame_count = 1
        flow_scale = 1.0  # flow scale
        denominator = frame_count + 1
        steps = [k / denominator for k in range(1, denominator)]

        # Pad on the right and bottom so height and width become multiples
        # of `multiple`.
        _, _, height, width = I0.shape
        multiple = max(32, int(32 / flow_scale))
        pad_right = (-width) % multiple
        pad_bottom = (-height) % multiple
        pad_spec = (0, pad_right, 0, pad_bottom)
        first_padded = F.pad(I0, pad_spec)
        second_padded = F.pad(I1, pad_spec)

        results = self.model.inference(
            first_padded, second_padded, steps, flow_scale, pred_bidir_flow=False
        )
        # The first result is indexed as (row, column, channel) to undo the
        # padding.
        return results[0][:height, :width, :]


# Build the wrapper, switch it to eval mode and move it to the GPU.
model = Model_inference()
model.eval().cuda()

# Random 1x3x720x1280 tensor on the GPU; it is passed as both inputs below.
test_input = torch.rand(1,3,720,1280).cuda()

# One call first, printing the shape of what comes back.
out = model(test_input, test_input)
print(out.shape)


# Then repeat the same call 1000 times with a tqdm progress bar.
from tqdm import tqdm
for i in tqdm(range(1000)):
  out = model(test_input, test_input)

720p is not possible.

512x512 is the highest I can do with VapourSynth for now.

I tried adding fp16 support, but I have not gotten far yet. With fp16 it should use a lot less VRAM.

import itertools
import numpy as np
import vapoursynth as vs
import functools
from pytorch_msssim import ssim, ms_ssim, SSIM, MS_SSIM

import torch 
device = torch.device("cuda")
# Turn off autograd recording.
# NOTE(review): PyTorch documents grad mode as thread-local, so this presumably
# only affects the thread that executes this line — confirm for threaded callers.
torch.set_grad_enabled(False)
# Keep cuDNN enabled and let it benchmark algorithms.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

def make_inference(model, I0, I1, n, scale, pred_bidir_flow=False):
    """Call ``model.inference`` for ``n`` evenly spaced intermediate timesteps.

    The timesteps are ``1/(n+1), 2/(n+1), ..., n/(n+1)``. Whatever
    ``model.inference`` returns is handed back unchanged.
    """
    intervals = n + 1
    fractions = [step / intervals for step in range(1, intervals)]
    return model.inference(I0, I1, fractions, scale, pred_bidir_flow)

# https://github.com/HolyWu/vs-rife/blob/master/vsrife/__init__.py
def GMFSS(
    clip: vs.VideoNode,
    fp16: bool = True,
    skip_framelist=None,
) -> vs.VideoNode:
    """Insert a GMFSS-interpolated frame between every pair of frames of ``clip``.

    The clip is interleaved with itself, so the result has twice as many
    frames. Every odd output frame is replaced by the model's interpolation
    between its two neighbours, except the last frame and any frame listed in
    ``skip_framelist``, which stay plain duplicates.

    Args:
        clip: Source clip. NOTE(review): planes are fed to the model unchanged
            and the model output is divided by 255 before being written back,
            so this presumably expects a float RGB clip — confirm.
        fp16: Accepted for interface compatibility but not used anywhere in
            this function; half-precision inference is not implemented here.
        skip_framelist: Optional collection of output frame numbers (indices
            into the interleaved clip) that must not be interpolated.
            ``None`` means no frame is skipped.

    Returns:
        The interleaved clip with interpolated frames evaluated lazily through
        ``std.FrameEval``.
    """
    core = vs.core
    from .GMFSS_arch import Model
    import torch
    import torch.nn.functional as F

    n_frames = 1
    scale = 1.0  # flow scale
    pred_bidir_flow = False  # Estimate bilateral optical flow at once (accelerate)

    # The original referenced `skip_framelist` without ever defining it, which
    # raised NameError on the first odd frame; it is now an optional argument.
    skipped = frozenset(skip_framelist or ())

    device = torch.device("cuda")
    torch.set_grad_enabled(False)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    model = Model()
    model.load_model("/workspace/tensorrt/VSGAN-tensorrt-docker/weights/", -1)
    model.eval()
    model.device("cuda")

    # Frames are padded on the right/bottom to a multiple of this value before
    # inference and cropped back afterwards.
    pad_multiple = max(32, int(32 / scale))

    def frame_to_tensor(frame: vs.VideoFrame):
        """Stack all planes of ``frame`` into one (planes, height, width) array."""
        return np.stack(
            [np.asarray(frame[plane]) for plane in range(frame.format.num_planes)]
        )

    def tensor_to_frame(f: vs.VideoFrame, array) -> vs.VideoFrame:
        """Copy ``array[plane]`` into each plane of the writable frame ``f``."""
        for plane in range(f.format.num_planes):
            d = np.asarray(f[plane])
            np.copyto(d, array[plane, :, :])
        return f

    def tensor_to_clip(clip: vs.VideoNode, image) -> vs.VideoNode:
        """Build a clip, sized like ``image``, whose frames all contain ``image``."""
        clip = core.std.BlankClip(
            clip=clip, width=image.shape[-1], height=image.shape[-2]
        )
        return core.std.ModifyFrame(
            clip=clip,
            clips=clip,
            selector=lambda n, f: tensor_to_frame(f.copy(), image),
        )

    def execute(n: int, clip: vs.VideoNode) -> vs.VideoNode:
        """Return ``clip`` for pass-through frames, else a clip of the interpolation."""
        # Even frames are originals; the last frame has no successor.
        if n % 2 == 0 or n in skipped or n == clip.num_frames - 1:
            return clip

        I0 = frame_to_tensor(clip.get_frame(n - 1))
        I1 = frame_to_tensor(clip.get_frame(n + 1))

        # Grad mode is thread-local, so the set_grad_enabled(False) call above
        # may not apply to the thread that runs this callback. Entering
        # inference_mode here guarantees autograd is off during inference.
        with torch.inference_mode():
            I0 = torch.Tensor(I0).unsqueeze(0).to(device, non_blocking=True)
            I1 = torch.Tensor(I1).unsqueeze(0).to(device, non_blocking=True)

            h, w = I0.shape[-2:]
            padding = (0, (-w) % pad_multiple, 0, (-h) % pad_multiple)
            I0 = F.pad(I0, padding)
            I1 = F.pad(I1, padding)

            middle = make_inference(model, I0, I1, n_frames, scale, pred_bidir_flow)[0]
            # The result is indexed as (height, width, channel): drop the
            # padding, then move the channel axis to the front and rescale.
            middle = middle[:h, :w]
            middle = middle.swapaxes(0, 2).swapaxes(1, 2) / 255

        return tensor_to_clip(clip=clip, image=middle)

    clip = core.std.Interleave([clip, clip])
    return core.std.FrameEval(
        core.std.BlankClip(clip=clip, width=clip.width, height=clip.height),
        functools.partial(execute, clip=clip),
    )

I can't figure out why your test.py needs less VRAM.

Update:
Right after writing this I figured out that

# Turn off autograd recording, keep cuDNN enabled and let it benchmark algorithms.
torch.set_grad_enabled(False)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

decreases VRAM usage a lot; I am leaving this here in case someone else runs into the same problem. Sadly, it does not help in my VapourSynth script. My request for fp16 is still valid, though.

Update2:
After a lot of trying I figured out that

with torch.inference_mode():

does the job as well, and uses even less VRAM than the torch.backends.cudnn settings.

hyw-dev / gmfss Goto Github PK

gmfss's People

Contributors

Stargazers

Watchers

Forkers

gmfss's Issues

Request: fp16 inference

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent

Jobs