facebookresearch / convnext-v2 Goto Github PK
View Code? Open in Web Editor NEWCode release for ConvNeXt V2 model
License: Other
Code release for ConvNeXt V2 model
License: Other
Hi, could anybody share the log file of pretraining? I use this great work to my own task, but I don't know what should be the correct range of loss as well as the training speed. Thanks!
I'm using ConvNeXt V2 as backbone of Mask2Former to train my custom dataset.
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import trunc_normal_, DropPath
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
class LayerNorm(nn.Module):
    """Layer normalization supporting two input layouts.

    * ``channels_last``  -- input shaped (N, H, W, C); delegates directly
      to ``F.layer_norm``.
    * ``channels_first`` -- input shaped (N, C, H, W); the normalization
      over the channel axis (dim 1) is computed by hand, since
      ``F.layer_norm`` only normalizes trailing dims.

    Args:
        normalized_shape (int): Number of channels to normalize over.
        eps (float): Numerical-stability epsilon. Default: 1e-6.
        data_format (str): "channels_last" or "channels_first".
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        # channels_first: standardize over dim 1, then apply the affine
        # parameters broadcast across the spatial dims.
        mean = x.mean(1, keepdim=True)
        var = (x - mean).pow(2).mean(1, keepdim=True)
        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.weight[:, None, None] * x_hat + self.bias[:, None, None]
class GRN(nn.Module):
    """Global Response Normalization (GRN) layer.

    Operates on channels-last tensors (N, H, W, C): the per-channel global
    L2 response over the spatial dims is divided by its mean across
    channels, and the result modulates the input through a learnable,
    residual affine transform.
    """

    def __init__(self, dim):
        super().__init__()
        # gamma and beta start at zero, so the layer is an identity at init.
        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, x):
        # Global spatial L2 norm per channel, shape (N, 1, 1, C).
        spatial_l2 = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        # Normalize by the cross-channel mean response (divisive normalization).
        scale = spatial_l2 / (spatial_l2.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * scale) + self.beta + x
class Block(nn.Module):
    """ConvNeXtV2 block.

    Pipeline: depthwise 7x7 conv -> LayerNorm -> 1x1 expand (Linear, 4x)
    -> GELU -> GRN -> 1x1 project (Linear), wrapped in a residual
    connection with optional stochastic depth. The pointwise convs run in
    channels-last layout, hence the two permutes around them.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
    """

    def __init__(self, dim, drop_path=0.):
        super().__init__()
        # groups=dim makes this a depthwise conv; 7x7 with padding 3 keeps
        # the spatial size unchanged.
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # Pointwise (1x1) convs implemented as Linear layers on (N, H, W, C).
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()
        self.grn = GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.drop_path = nn.Identity() if drop_path <= 0. else DropPath(drop_path)

    def forward(self, x):
        shortcut = x
        y = self.dwconv(x)
        y = y.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        y = self.norm(y)
        y = self.pwconv2(self.grn(self.act(self.pwconv1(y))))
        y = y.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
        return shortcut + self.drop_path(y)
@BACKBONE_REGISTRY.register()
class ConvNeXtV2(Backbone):
    """ ConvNeXt V2 backbone for detectron2.

    Config keys read from ``cfg.MODEL.CONVNEXTV2``:
        IN_CHANS (int): Number of input image channels.
        DEPTHS (tuple(int)): Number of blocks at each of the 4 stages,
            e.g. [3, 3, 9, 3].
        DIMS (tuple(int)): Feature dimension at each stage,
            e.g. [96, 192, 384, 768].
        DROP_PATH_RATE (float): Stochastic depth rate, increased linearly
            over all blocks.
        OUT_FEATURES (list(str)): Subset of {"res2", "res3", "res4", "res5"}
            to return from ``forward``.
    """
    def __init__(self, cfg, input_shape):
        super().__init__()
        in_chans = cfg.MODEL.CONVNEXTV2.IN_CHANS
        depths = cfg.MODEL.CONVNEXTV2.DEPTHS
        dims = cfg.MODEL.CONVNEXTV2.DIMS
        drop_path_rate = cfg.MODEL.CONVNEXTV2.DROP_PATH_RATE
        self.depths = depths
        # Stem (4x4 conv, stride 4) and 3 intermediate 2x2/stride-2
        # downsampling convs, each preceded by a channels-first LayerNorm.
        self.downsample_layers = nn.ModuleList()
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)
        # 4 feature-resolution stages of residual blocks; the per-block
        # stochastic-depth rate ramps linearly from 0 to drop_path_rate.
        self.stages = nn.ModuleList()
        dp_rates = [r.item() for r in torch.linspace(0, drop_path_rate, sum(depths))]
        cur = 0
        for i in range(4):
            stage = nn.Sequential(
                *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
            )
            self.stages.append(stage)
            cur += depths[i]
        self._out_features = cfg.MODEL.CONVNEXTV2.OUT_FEATURES
        self._out_feature_strides = {
            "res2": 4,
            "res3": 8,
            "res4": 16,
            "res5": 32,
        }
        self._out_feature_channels = {
            "res2": dims[0],
            "res3": dims[1],
            "res4": dims[2],
            "res5": dims[3],
        }
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal init for conv/linear weights; zero biases."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=.02)
            # Guard: Conv2d/Linear may be constructed with bias=False,
            # in which case m.bias is None and constant_ would crash.
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Run all 4 stages; return only the features named in OUT_FEATURES.

        Fix: the previous implementation returned all four stages
        unconditionally, ignoring ``self._out_features`` and thus
        disagreeing with ``output_shape()`` and the detectron2 Backbone
        convention of returning exactly the requested feature maps.
        """
        out = {}
        for i in range(4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
            name = "res{}".format(i + 2)
            if name in self._out_features:
                out[name] = x
        return out

    def output_shape(self):
        """Channel/stride metadata for each requested output feature."""
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
            for name in self._out_features
        }
I download ConvNeXt V2-L 384x384 weights, and delete "head.bias", "head.weight", "norm.bias", "norm.weight" because for backbone I'm not using these weights. Training loss continuously decrease and everything goes well.
But when i shift to ConvNeXt V2-B 384x384 weights. Training loss became super unstable, and barely decrease.
I'm also using some other custom backbone, and nothing like this happened.
I followed the tutorial exactly, but at the `cd MinkowskiEngine` step I get an error saying that this directory/file does not exist.
The first ConvNext has an MIT license. Might ConvNext V2 be open sourced in the future? It's sad to see licensing changes between V1 and V2.
Nice work ! based on the benchmarks in the paper and also this repository, it seems that all ImageNet-1K benchmarks use self-supervised pre-trained checkpoints. If this is the case, I would like to ask if authors can provide ImageNet-1K benchmarks for models that are trained from scratch. This enables direct comparison to other models which don't use self-supervised pre-training which constitute a large fraction of the literature.
The official re-training weights:
model: OrderedDict
downsample_layers.0.0.bias: Tensor with shape (128,)
downsample_layers.0.0.weight: Tensor with shape (128, 3, 4, 4)
downsample_layers.0.1.bias: Tensor with shape (128,)
downsample_layers.0.1.weight: Tensor with shape (128,)
downsample_layers.1.0.bias: Tensor with shape (128,)
downsample_layers.1.0.weight: Tensor with shape (128,)
downsample_layers.1.1.bias: Tensor with shape (256,)
downsample_layers.1.1.weight: Tensor with shape (256, 128, 2, 2)
downsample_layers.2.0.bias: Tensor with shape (256,)
downsample_layers.2.0.weight: Tensor with shape (256,)
downsample_layers.2.1.bias: Tensor with shape (512,)
downsample_layers.2.1.weight: Tensor with shape (512, 256, 2, 2)
downsample_layers.3.0.bias: Tensor with shape (512,)
downsample_layers.3.0.weight: Tensor with shape (512,)
downsample_layers.3.1.bias: Tensor with shape (1024,)
downsample_layers.3.1.weight: Tensor with shape (1024, 512, 2, 2)
.....
The model parameters:
mask_token: Tensor with shape (1, 512, 1, 1)
encoder.downsample_layers.0.0.weight: Tensor with shape (128, 3, 4, 4)
encoder.downsample_layers.0.0.bias: Tensor with shape (128,)
encoder.downsample_layers.0.1.weight: Tensor with shape (128,)
encoder.downsample_layers.0.1.bias: Tensor with shape (128,)
encoder.downsample_layers.1.0.ln.weight: Tensor with shape (128,)
encoder.downsample_layers.1.0.ln.bias: Tensor with shape (128,)
encoder.downsample_layers.1.1.kernel: Tensor with shape (4, 128, 256)
encoder.downsample_layers.1.1.bias: Tensor with shape (1, 256)
encoder.downsample_layers.2.0.ln.weight: Tensor with shape (256,)
encoder.downsample_layers.2.0.ln.bias: Tensor with shape (256,)
encoder.downsample_layers.2.1.kernel: Tensor with shape (4, 256, 512)
encoder.downsample_layers.2.1.bias: Tensor with shape (1, 512)
encoder.downsample_layers.3.0.ln.weight: Tensor with shape (512,)
encoder.downsample_layers.3.0.ln.bias: Tensor with shape (512,)
encoder.downsample_layers.3.1.kernel: Tensor with shape (4, 512, 1024)
encoder.downsample_layers.3.1.bias: Tensor with shape (1, 1024)
Dear Author,
In the paper, you mentioned that masking is done on the raw images. However, in your code, masking is only done after the stem layer. Can you explain the inconsistency? Thank you!
Good work, thank you.
However, training is very slow when training convnextv2_base: the GPU utilization is only about 50%. Are there any solutions? Thanks.
I have implemented ConvNeXtV2 in TensorFlow 2.X:
https://github.com/edwardyehuang/iSeg/blob/master/backbones/convnext_v2.py
The weights can be found here:
https://github.com/edwardyehuang/iSeg/tree/master/backbones
This implementation is compatible with TPU, determinism, and mixed precision on TensorFlow. It also has been tested on downstream tasks.
Dear authors,
I have played around both ConvNeXt v1 and yours using TIMM codebase with my own datasets.
Using V1 I don't struggle with training/fine-tuning on my datasets and am pleased with the overall performance I obtained for TIMM's variants.
However, I can not achieve any comparative performance (overall accuracy as well as computed costs, of course) using your V2 variants with regarding every pretrained weights.
Can you give me any tip, trick, or treat for a set of your hyperparameters?
Thank in advance.
Linh
Great work! I am trying to use the GRN in another model, but when I checked the source code, I found one mismatch. In the paper, equation 2 uses the sum of gx as the denominator, but in this implementation, the GRN module uses the mean operator. I don't know whether this will affect the final result.
Hi! Really nice work!
I wonder what is the sup. training setting for ConvNeXt V1-A Supervised (75.7%) in Table 14. Is it also 300 epochs, the same as ConvNeXt -Tiny from the ConvNeXt V1 paper?
If not, could you share the sup. training setting for atto, pico, and nano models?
May I know when the semantic_segmentation version of Convnextv2 will be released
I modify the code of convnextv1 in mmdet, including adding GRN and deleting LayerScale.
I use the checkpoint "convnextv2_base_1k_224_fcmae.pt" to initialize model weights, and I also use "remap_checkpoint_keys" to reshape grn affine parameters and biases.
No error was reported when loading checkpoint, but after 1 epoch, I only get about 1m AP in coco.
How to get the predicted results of the test set?
Hello. Thank you for publishing the implementation of your exciting work!
I have a question regarding usage of sparse convolutional modules from MinkowskiEngine.
You use dimension=3
both in depthwise and strided convolutions. Why is this the case since images only consist of 2 spatial dimensions?
Thank you for providing insights around this :)
Thank you for your outstanding work. If I want to train a segmentation network, how can I modify the code?
Does the apex in the INSTALL.md have any function?
I have tried many different methods to install the content of "pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./", but they are all not working.
The error is "no module named 'packaging'" error: subprocess-exited-with-error
Without this apex, it seems to be also able to fine-tune the model. Could anyone help me explain the function of apex, or help me install it?
Thanks very very much for any help!
Hello, great work.
In Install.md it says: (Note: we have implemented a customized CUDA kernel for depth-wise convolutions, which the original MinkowskiEngine does not support.)
but in the code, it still has from MinkowskiEngine import MinkowskiDepthwiseConvolution
So what do you mean by that?
I found that there is MinkowskiChannelwiseConvolution.py in MinkowskiEngine, is it possible to replace MinkowskiDepthwiseConvolution in the code with MinkowskiChannelwiseConvolution, or some another solution? I hope I can get your answer, thanks!
Translated with www.DeepL.com/Translator (free version)
I try to reproduce the results that use supervised training from scratch. However, I cannot reproduce the results reported in the paper. Specifically, for the base model, the reported top-1 ACC is 84.3 (Table 14). However, my reproduced results are 77.7 (100 ep) and 80.6 (300ep). See the logs here.
By the way, can you release the pretraining/finetuning logs for our reference and comparison? It is too expensive for us to reproduce these results.
Related issue: #37
Line 186 in 2553895
The above function implements the MAE training logic. If I understand correctly, x
after self.forward_encoder()
should have been downsampled 4 times after patchification. See
ConvNeXt-V2/models/convnextv2_sparse.py
Line 122 in 2553895
Then, pred
should have the same spatial dimension as x
. self.proj
and self.decoder
preserve spatial dimensions. self.pred
only recovers the in-patch dimensions, preserving the number of patches.
Overall, pred
should have 4^4
times fewer patches than self.patchify(imgs)
. However, they are directly put in a subtraction in
Line 164 in 2553895
What am I missing here?
Essentially, how do you recover the downsampled number of patches if you decode from the lowest-resolution encodings?
Dear authors,
I tried to change the size of the patches from 32x32 to 16x16. When decreasing the size of the patches, there is an error in the upsample_mask
function in sparse_convnextv2.py
file. This error is related to the size of the upsampled mask and the downsampled features. When decreasing the size of the patches, should we change the stride of the stem or add more layers in the encoding part? This is not very clear for me.
Best,
Colin Decourt
Hi, the work you've done is amazing, just like MAE. However, I found that the pre-trained model with decoder is missing when I was using main_pretrain. May I ask if you could release it? Thank you
Hi,Thank you for open source. self-supervised pre-trained weights miss Parameter. The model parameters are incomplete. The parameter name has been changed
Describe the bug
I installed it according to the installation steps. I used ema in my project, which used deepcopy code, and this error occurred when the project was running.
“TypeError: cannot pickle 'MinkowskiConvolutionFunction' object”
Expected behavior
The project is running normally
Desktop (please complete the following information):
OS: [e.g. Ubuntu 18.04]
Python version: [e.g. 3.8.13]
Pytorch version: [e.g. 1.10.1]
CUDA version: [e.g. 11.3]
NVIDIA Driver version: [e.g. 510.60]
Minkowski Engine version [e.g. 0.5.4]
cd MinkowskiEngine
python setup.py install --blas_include_dirs=${CONDA_PREFIX}/include --blas=openblas
reported an error:
/ConvNeXt-V2/MinkowskiEngine/src/3rdparty/concurrent_unordered_map.cuh(503): error: namespace "thrust" has no member "device"
1 error detected in the compilation of "/home/mjddh/ConvNeXt-V2/MinkowskiEngine/src/broadcast_gpu.cu".
error: command '/usr/local/cuda-12.0/bin/nvcc' failed with exit code 2
The following image is the result of epoch=69.
The reconstruction of unmasked part have lots of noise, is this a normal phenomenon?
And I have found a strange problem. If i replace the first conv in the stem with MinkowskiDepthwiseConvolution, the training loss will quickly drop to 0. I can't understand why this is happening.
While reading the TRAINING.md I noticed that the mask is applied twice in the Jax's Block
implementation:
@nn.compact
def __call__(self, inputs, mask=None):
if mask is not None:
x = inputs * (1. - mask)
x = DepthwiseConv2D((7, 7), name='dwconv')(x)
if mask is not None: # The binary masking is numerically identical to sparse conv.
x = x * (1.- mask)
(...)
What benefit does this bring? Couldn't the mask be only applied to the output with the same result? Aka something like:
@nn.compact
def __call__(self, inputs, mask=None):
x = DepthwiseConv2D((7, 7), name='dwconv')(x)
if mask is not None: # The binary masking is numerically identical to sparse conv.
x = x * (1.- mask)
I would greatly appreciate some hint.
Hello,
I just want to ask, what is the functional difference between the implemented layernorm vs layernorm in pytorch? (not the sparse layer norm).
Would using layernorm from pytorch impact performance?
Hi, I cant find the lr_scale parameter in the function description for torch.optim.AdamW. Is this a typo or did you mean to pass the parameter groups with lr instead of lr_scale. Shown below is the snippet from optim_factory.py lines 123-132 :
parameter_group_names[group_name] = {
"weight_decay": this_weight_decay,
"params": [],
"lr_scale": scale
}
parameter_group_vars[group_name] = {
"weight_decay": this_weight_decay,
"params": [],
"lr_scale": scale
}
Could you provide the visualization code and how to compute feature consine distance? Thank you very much for your help.
In the paper it was writen that "sweep layer-wise learning rate decay in {0.9, 0.95}, stochastic depth rate in {0.2, 0.3, 0.4, 0.5}". I'd like to know which of them are your recommended parameters with ConvNeXt-V2 Base Mask-RCNN in COCO task.
Amazing work and an absolutely lovely read to start 2023 with :)
Did a quick implementation in Keras/TF.
Should add SparseConvNeXtV2 and the masked autoencoder training framework.
Basic architecture available at: https://github.com/DavidLandup0/keras-cv/tree/convnextv2
class GRN(nn.Module):
""" GRN (Global Response Normalization) layer
"""
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
def forward(self, x):
Gx = torch.norm(x, p=2, dim=(1,2), keepdim=True)
Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
return self.gamma * (x * Nx) + self.beta + x
What is the difference between the custom LayerNorm and BN、LN(pytorch) in the code。
Hello authors,
I'm so excited to see that MAE can also work on ConvNets! I was wondering that are the sparseconv MAE and GRN architecture-agnostic that they can also work on other CNNs like a vanilla ResNet without the ConvNeXt tricks? Have you done any experiments on this? Hope you can share some insights.
Thanks!
What's the efficiency comparison between
Specifically, I'd like to know the following statistics in the pertaining stage for the two variants:
As far as I know, sparse operators are much less efficient in modern hardware. According to Nvidia blog:
Even though sparse linear algebra allows representing huge matrices very efficiently, it typically does not provide competitive performance compared to dense counterparts in cases when sparsity is below 95%. This is due to irregular computation and scattered memory accesses. In fact, many of the linear algebra applications that benefit from sparsity have over 99% sparsity in their matrices.
Even a highly optimized block-sparse kernel with a strict assumption on the sparse pattern still requires 40%~50% sparsity to counteract the hardware inefficiency. Hence, as far as I'm concerned, a 60% mask rate (i.e. 60% sparsity) is not enough to accelerate the pertaining with sparse convolution, if it is not slower than dense convolution + binary masking.
Hi, very interesting and nice work! Thanks a lot!
I wonder the implementation of the "Feature collapse" visualization.
Would you mind share the related visualization code?
Any response is appreciated.
criterion = LabelSmoothingCrossEntropy()
Traceback (most recent call last):
File "main_finetune.py", line 437, in
main(args)
File "main_finetune.py", line 351, in main
optimizer=optimizer, loss_scaler=loss_scaler, model_ema=model_ema)
File "/media/wangshuang/sdb/object_class/ConvNeXt-V2/utils.py", line 495, in auto_load_model
model_without_ddp.load_state_dict(checkpoint['model'])
File "/home/wangshuang/anaconda3/envs/yolov8/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1224, in load_state_dict
self.class.name, "\n\t".join(error_msgs)))
RuntimeError: Error(s) in loading state_dict for ConvNeXtV2:
Missing key(s) in state_dict: "downsample_layers.0.0.weight",
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.