facebookresearch / convnext-v2 Goto Github PK
View Code? Open in Web Editor NEWCode release for ConvNeXt V2 model
License: Other
Code release for ConvNeXt V2 model
License: Other
Hi, could anybody share the log file of pretraining? I use this great work to my own task, but I don't know what should be the correct range of loss as well as the training speed. Thanks!
I'm using ConvNeXt V2 as backbone of Mask2Former to train my custom dataset.
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import trunc_normal_, DropPath
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
class LayerNorm(nn.Module):
    """Layer normalization supporting two input layouts.

    * ``channels_last``  -- input shaped (N, H, W, C); delegates directly
      to ``F.layer_norm``.
    * ``channels_first`` -- input shaped (N, C, H, W); the normalization
      over the channel axis (dim 1) is computed by hand, since
      ``F.layer_norm`` only normalizes trailing dims.

    Args:
        normalized_shape (int): Number of channels to normalize over.
        eps (float): Numerical-stability epsilon. Default: 1e-6.
        data_format (str): "channels_last" or "channels_first".
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        # channels_first: standardize over dim 1, then apply the affine
        # parameters broadcast across the spatial dims.
        mean = x.mean(1, keepdim=True)
        var = (x - mean).pow(2).mean(1, keepdim=True)
        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.weight[:, None, None] * x_hat + self.bias[:, None, None]
class GRN(nn.Module):
    """Global Response Normalization (GRN) layer.

    Operates on channels-last tensors (N, H, W, C): the per-channel global
    L2 response over the spatial dims is divided by its mean across
    channels, and the result modulates the input through a learnable,
    residual affine transform.
    """

    def __init__(self, dim):
        super().__init__()
        # gamma and beta start at zero, so the layer is an identity at init.
        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, x):
        # Global spatial L2 norm per channel, shape (N, 1, 1, C).
        spatial_l2 = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        # Normalize by the cross-channel mean response (divisive normalization).
        scale = spatial_l2 / (spatial_l2.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * scale) + self.beta + x
class Block(nn.Module):
    """ConvNeXtV2 block.

    Pipeline: depthwise 7x7 conv -> LayerNorm -> 1x1 expand (Linear, 4x)
    -> GELU -> GRN -> 1x1 project (Linear), wrapped in a residual
    connection with optional stochastic depth. The pointwise convs run in
    channels-last layout, hence the two permutes around them.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
    """

    def __init__(self, dim, drop_path=0.):
        super().__init__()
        # groups=dim makes this a depthwise conv; 7x7 with padding 3 keeps
        # the spatial size unchanged.
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # Pointwise (1x1) convs implemented as Linear layers on (N, H, W, C).
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()
        self.grn = GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.drop_path = nn.Identity() if drop_path <= 0. else DropPath(drop_path)

    def forward(self, x):
        shortcut = x
        y = self.dwconv(x)
        y = y.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        y = self.norm(y)
        y = self.pwconv2(self.grn(self.act(self.pwconv1(y))))
        y = y.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
        return shortcut + self.drop_path(y)
@BACKBONE_REGISTRY.register()
class ConvNeXtV2(Backbone):
    """ ConvNeXt V2 backbone for detectron2.

    Config keys read from ``cfg.MODEL.CONVNEXTV2``:
        IN_CHANS (int): Number of input image channels.
        DEPTHS (tuple(int)): Number of blocks at each of the 4 stages,
            e.g. [3, 3, 9, 3].
        DIMS (tuple(int)): Feature dimension at each stage,
            e.g. [96, 192, 384, 768].
        DROP_PATH_RATE (float): Stochastic depth rate, increased linearly
            over all blocks.
        OUT_FEATURES (list(str)): Subset of {"res2", "res3", "res4", "res5"}
            to return from ``forward``.
    """
    def __init__(self, cfg, input_shape):
        super().__init__()
        in_chans = cfg.MODEL.CONVNEXTV2.IN_CHANS
        depths = cfg.MODEL.CONVNEXTV2.DEPTHS
        dims = cfg.MODEL.CONVNEXTV2.DIMS
        drop_path_rate = cfg.MODEL.CONVNEXTV2.DROP_PATH_RATE
        self.depths = depths
        # Stem (4x4 conv, stride 4) and 3 intermediate 2x2/stride-2
        # downsampling convs, each preceded by a channels-first LayerNorm.
        self.downsample_layers = nn.ModuleList()
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)
        # 4 feature-resolution stages of residual blocks; the per-block
        # stochastic-depth rate ramps linearly from 0 to drop_path_rate.
        self.stages = nn.ModuleList()
        dp_rates = [r.item() for r in torch.linspace(0, drop_path_rate, sum(depths))]
        cur = 0
        for i in range(4):
            stage = nn.Sequential(
                *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
            )
            self.stages.append(stage)
            cur += depths[i]
        self._out_features = cfg.MODEL.CONVNEXTV2.OUT_FEATURES
        self._out_feature_strides = {
            "res2": 4,
            "res3": 8,
            "res4": 16,
            "res5": 32,
        }
        self._out_feature_channels = {
            "res2": dims[0],
            "res3": dims[1],
            "res4": dims[2],
            "res5": dims[3],
        }
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal init for conv/linear weights; zero biases."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=.02)
            # Guard: Conv2d/Linear may be constructed with bias=False,
            # in which case m.bias is None and constant_ would crash.
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Run all 4 stages; return only the features named in OUT_FEATURES.

        Fix: the previous implementation returned all four stages
        unconditionally, ignoring ``self._out_features`` and thus
        disagreeing with ``output_shape()`` and the detectron2 Backbone
        convention of returning exactly the requested feature maps.
        """
        out = {}
        for i in range(4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
            name = "res{}".format(i + 2)
            if name in self._out_features:
                out[name] = x
        return out

    def output_shape(self):
        """Channel/stride metadata for each requested output feature."""
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
            for name in self._out_features
        }
I download ConvNeXt V2-L 384x384 weights, and delete "head.bias", "head.weight", "norm.bias", "norm.weight" because for backbone I'm not using these weights. Training loss continuously decrease and everything goes well.
But when i shift to ConvNeXt V2-B 384x384 weights. Training loss became super unstable, and barely decrease.
I'm also using some other custom backbone, and nothing like this happened.
I followed the tutorial exactly, but at the `cd MinkowskiEngine` step I get an error saying that this directory/file does not exist.
The first ConvNext has an MIT license. Might ConvNext V2 be open sourced in the future? It's sad to see licensing changes between V1 and V2.
Nice work ! based on the benchmarks in the paper and also this repository, it seems that all ImageNet-1K benchmarks use self-supervised pre-trained checkpoints. If this is the case, I would like to ask if authors can provide ImageNet-1K benchmarks for models that are trained from scratch. This enables direct comparison to other models which don't use self-supervised pre-training which constitute a large fraction of the literature.
The official re-training weights:
model: OrderedDict
downsample_layers.0.0.bias: Tensor with shape (128,)
downsample_layers.0.0.weight: Tensor with shape (128, 3, 4, 4)
downsample_layers.0.1.bias: Tensor with shape (128,)
downsample_layers.0.1.weight: Tensor with shape (128,)
downsample_layers.1.0.bias: Tensor with shape (128,)
downsample_layers.1.0.weight: Tensor with shape (128,)
downsample_layers.1.1.bias: Tensor with shape (256,)
downsample_layers.1.1.weight: Tensor with shape (256, 128, 2, 2)
downsample_layers.2.0.bias: Tensor with shape (256,)
downsample_layers.2.0.weight: Tensor with shape (256,)
downsample_layers.2.1.bias: Tensor with shape (512,)
downsample_layers.2.1.weight: Tensor with shape (512, 256, 2, 2)
downsample_layers.3.0.bias: Tensor with shape (512,)
downsample_layers.3.0.weight: Tensor with shape (512,)
downsample_layers.3.1.bias: Tensor with shape (1024,)
downsample_layers.3.1.weight: Tensor with shape (1024, 512, 2, 2)
.....
The model parameters:
mask_token: Tensor with shape (1, 512, 1, 1)
encoder.downsample_layers.0.0.weight: Tensor with shape (128, 3, 4, 4)
encoder.downsample_layers.0.0.bias: Tensor with shape (128,)
encoder.downsample_layers.0.1.weight: Tensor with shape (128,)
encoder.downsample_layers.0.1.bias: Tensor with shape (128,)
encoder.downsample_layers.1.0.ln.weight: Tensor with shape (128,)
encoder.downsample_layers.1.0.ln.bias: Tensor with shape (128,)
encoder.downsample_layers.1.1.kernel: Tensor with shape (4, 128, 256)
encoder.downsample_layers.1.1.bias: Tensor with shape (1, 256)
encoder.downsample_layers.2.0.ln.weight: Tensor with shape (256,)
encoder.downsample_layers.2.0.ln.bias: Tensor with shape (256,)
encoder.downsample_layers.2.1.kernel: Tensor with shape (4, 256, 512)
encoder.downsample_layers.2.1.bias: Tensor with shape (1, 512)
encoder.downsample_layers.3.0.ln.weight: Tensor with shape (512,)
encoder.downsample_layers.3.0.ln.bias: Tensor with shape (512,)
encoder.downsample_layers.3.1.kernel: Tensor with shape (4, 512, 1024)
encoder.downsample_layers.3.1.bias: Tensor with shape (1, 1024)
Dear Author,
In the paper, you mentioned that masking is done on the raw images. However, in your code, masking is only done after the stem layer. Can you explain the inconsistency? Thank you!
Good work, thank you.
However, training is very slow when training convnextv2_base: the GPU utilization is only about 50%. Are there any solutions? Thanks.
I have implemented ConvNeXtV2 in TensorFlow 2.X:
https://github.com/edwardyehuang/iSeg/blob/master/backbones/convnext_v2.py
The weights can be found here:
https://github.com/edwardyehuang/iSeg/tree/master/backbones
This implementation is compatible with TPU, determinism, and mixed precision on TensorFlow. It also has been tested on downstream tasks.
Dear authors,
I have played around both ConvNeXt v1 and yours using TIMM codebase with my own datasets.
Using V1 I don't struggle with training/fine-tuning on my datasets and am pleased with the overall performance I obtained for TIMM's variants.
However, I can not achieve any comparative performance (overall accuracy as well as computed costs, of course) using your V2 variants with regarding every pretrained weights.
Can you give me any tip, trick, or treat for a set of your hyperparameters?
Thank in advance.
Linh
Great work! I am trying to use the GRN in another model, but when I checked the source code, I found one mismatch. In the paper, equation 2 uses the sum of gx as the denominator, but in this implementation, the GRN module uses the mean operator. I don't know whether this will affect the final result.
Hi! Really nice work!
I wonder what is the sup. training setting for ConvNeXt V1-A Supervised (75.7%) in Table 14. Is it also 300 epochs, the same as ConvNeXt -Tiny from the ConvNeXt V1 paper?
If not, could you share the sup. training setting for atto, pico, and nano models?
May I know when the semantic_segmentation version of Convnextv2 will be released
I modify the code of convnextv1 in mmdet, including adding GRN and deleting LayerScale.
I use the checkpoint "convnextv2_base_1k_224_fcmae.pt" to initialize model weights, and I also use "remap_checkpoint_keys" to reshape grn affine parameters and biases.
No error was reported when loading checkpoint, but after 1 epoch, I only get about 1m AP in coco.
How to get the predicted results of the test set?
Hello. Thank you for publishing the implementation of your exciting work!
I have a question regarding usage of sparse convolutional modules from MinkowskiEngine.
You use dimension=3
both in depthwise and strided convolutions. Why is this the case since images only consist of 2 spatial dimensions?
Thank you for providing insights around this :)
Thank you for your outstanding work. If I want to train a segmentation network, how can I modify the code?
Does the apex in the INSTALL.md have any function?
I have tried many different methods to install the content of "pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./", but they are all not working.
The error is "no module named 'packaging'" error: subprocess-exited-with-error
Without this apex, it seems to be also able to fine-tune the model. Could anyone help me explain the function of apex, or help me install it?
Thanks very very much for any help!
Hello, great work.
In Install.md it says: (Note: we have implemented a customized CUDA kernel for depth-wise convolutions, which the original MinkowskiEngine does not support.)
but in the code, it still has from MinkowskiEngine import MinkowskiDepthwiseConvolution
So what do you mean by that?
I found that there is MinkowskiChannelwiseConvolution.py in MinkowskiEngine, is it possible to replace MinkowskiDepthwiseConvolution in the code with MinkowskiChannelwiseConvolution, or some another solution? I hope I can get your answer, thanks!
Translated with www.DeepL.com/Translator (free version)
I try to reproduce the results that use supervised training from scratch. However, I cannot reproduce the results reported in the paper. Specifically, for the base model, the reported top-1 ACC is 84.3 (Table 14). However, my reproduced results are 77.7 (100 ep) and 80.6 (300ep). See the logs here.
By the way, can you release the pretraining/finetuning logs for our reference and comparison? It is too expensive for us to reproduce these results.
Related issue: #37
Line 186 in 2553895
The above function implements the MAE training logic. If I understand correctly, x
after self.forward_encoder()
should have been downsampled 4 times after patchification. See
ConvNeXt-V2/models/convnextv2_sparse.py
Line 122 in 2553895
Then, pred
should have the same spatial dimension as x
. self.proj
and self.decoder
preserve spatial dimensions. self.pred
only recovers the in-patch dimensions, preserving the number of patches.
Overall, pred
should have 4^4
times fewer patches than self.patchify(imgs)
. However, they are directly put in a subtraction in
Line 164 in 2553895
What am I missing here?
Essentially, how do you recover the downsampled number of patches if you decode from the lowest-resolution encodings?
Dear authors,
I tried to change the size of the patches from 32x32 to 16x16. When decreasing the size of the patches, there is an error in the upsample_mask
function in sparse_convnextv2.py
file. This error is related to the size of the upsampled mask and the downsampled features. When decreasing the size of the patches, should we change the stride of the stem or add more layers in the encoding part? This is not very clear for me.
Best,
Colin Decourt
Hi, the work you've done is amazing, just like MAE. However, I found that the pre-trained model with decoder is missing when I was using main_pretrain. May I ask if you could release it? Thank you
Hi,Thank you for open source. self-supervised pre-trained weights miss Parameter. The model parameters are incomplete. The parameter name has been changed
Describe the bug
I installed it according to the installation steps. I used ema in my project, which used deepcopy code, and this error occurred when the project was running.
“TypeError: cannot pickle 'MinkowskiConvolutionFunction' object”
Expected behavior
The project is running normally
Desktop (please complete the following information):
OS: [e.g. Ubuntu 18.04]
Python version: [e.g. 3.8.13]
Pytorch version: [e.g. 1.10.1]
CUDA version: [e.g. 11.3]
NVIDIA Driver version: [e.g. 510.60]
Minkowski Engine version [e.g. 0.5.4]
cd MinkowskiEngine
python setup.py install --blas_include_dirs=${CONDA_PREFIX}/include --blas=openblas
reported an error:
/ConvNeXt-V2/MinkowskiEngine/src/3rdparty/concurrent_unordered_map.cuh(503): error: namespace "thrust" has no member "device"
1 error detected in the compilation of "/home/mjddh/ConvNeXt-V2/MinkowskiEngine/src/broadcast_gpu.cu".
error: command '/usr/local/cuda-12.0/bin/nvcc' failed with exit code 2
The following image is the result of epoch=69.
The reconstruction of unmasked part have lots of noise, is this a normal phenomenon?
And I have found a strange problem. If i replace the first conv in the stem with MinkowskiDepthwiseConvolution, the training loss will quickly drop to 0. I can't understand why this is happening.
While reading the TRAINING.md I noticed that the mask is applied twice in the Jax's Block
implementation:
@nn.compact
def __call__(self, inputs, mask=None):
if mask is not None:
x = inputs * (1. - mask)
x = DepthwiseConv2D((7, 7), name='dwconv')(x)
if mask is not None: # The binary masking is numerically identical to sparse conv.
x = x * (1.- mask)
(...)
What benefit does this bring? Couldn't the mask be only applied to the output with the same result? Aka something like:
@nn.compact
def __call__(self, inputs, mask=None):
x = DepthwiseConv2D((7, 7), name='dwconv')(x)
if mask is not None: # The binary masking is numerically identical to sparse conv.
x = x * (1.- mask)
I would greatly appreciate some hint.
Hello,
I just want to ask, what is the functional difference between the implemented layernorm vs layernorm in pytorch? (not the sparse layer norm).
Would using layernorm from pytorch impact performance?
Hi, I cant find the lr_scale parameter in the function description for torch.optim.AdamW. Is this a typo or did you mean to pass the parameter groups with lr instead of lr_scale. Shown below is the snippet from optim_factory.py lines 123-132 :
parameter_group_names[group_name] = {
"weight_decay": this_weight_decay,
"params": [],
"lr_scale": scale
}
parameter_group_vars[group_name] = {
"weight_decay": this_weight_decay,
"params": [],
"lr_scale": scale
}
Could you provide the visualization code and how to compute feature consine distance? Thank you very much for your help.
In the paper it was writen that "sweep layer-wise learning rate decay in {0.9, 0.95}, stochastic depth rate in {0.2, 0.3, 0.4, 0.5}". I'd like to know which of them are your recommended parameters with ConvNeXt-V2 Base Mask-RCNN in COCO task.
Amazing work and an absolutely lovely read to start 2023 with :)
Did a quick implementation in Keras/TF.
Should add SparseConvNeXtV2 and the masked autoencoder training framework.
Basic architecture available at: https://github.com/DavidLandup0/keras-cv/tree/convnextv2
class GRN(nn.Module):
""" GRN (Global Response Normalization) layer
"""
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
def forward(self, x):
Gx = torch.norm(x, p=2, dim=(1,2), keepdim=True)
Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
return self.gamma * (x * Nx) + self.beta + x
What is the difference between the custom LayerNorm and BN、LN(pytorch) in the code。
Hello authors,
I'm so excited to see that MAE can also work on ConvNets! I was wondering that are the sparseconv MAE and GRN architecture-agnostic that they can also work on other CNNs like a vanilla ResNet without the ConvNeXt tricks? Have you done any experiments on this? Hope you can share some insights.
Thanks!
What's the efficiency comparison between
Specifically, I'd like to know the following statistics in the pertaining stage for the two variants:
As far as I know, sparse operators are much less efficient in modern hardware. According to Nvidia blog:
Even though sparse linear algebra allows representing huge matrices very efficiently, it typically does not provide competitive performance compared to dense counterparts in cases when sparsity is below 95%. This is due to irregular computation and scattered memory accesses. In fact, many of the linear algebra applications that benefit from sparsity have over 99% sparsity in their matrices.
Even a highly optimized block-sparse kernel with a strict assumption on the sparse pattern still requires 40%~50% sparsity to counteract the hardware inefficiency. Hence, as far as I'm concerned, a 60% mask rate (i.e. 60% sparsity) is not enough to accelerate the pertaining with sparse convolution, if it is not slower than dense convolution + binary masking.
Hi, very interesting and nice work! Thanks a lot!
I wonder the implementation of the "Feature collapse" visualization.
Would you mind share the related visualization code?
Any response is appreciated.
criterion = LabelSmoothingCrossEntropy()
Traceback (most recent call last):
File "main_finetune.py", line 437, in
main(args)
File "main_finetune.py", line 351, in main
optimizer=optimizer, loss_scaler=loss_scaler, model_ema=model_ema)
File "/media/wangshuang/sdb/object_class/ConvNeXt-V2/utils.py", line 495, in auto_load_model
model_without_ddp.load_state_dict(checkpoint['model'])
File "/home/wangshuang/anaconda3/envs/yolov8/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1224, in load_state_dict
self.class.name, "\n\t".join(error_msgs)))
RuntimeError: Error(s) in loading state_dict for ConvNeXtV2:
Missing key(s) in state_dict: "downsample_layers.0.0.weight",
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.