
Comments (3)

dcherian commented on June 12, 2024

A small example would help, but it seems to be an issue with propagating that MultiIndex.

I think that line is trying to propagate non-dimension coordinates, but it probably needs to be updated to the new .indexes model:

set(ds_broad.variables) - set(ds_broad.dims)

Perhaps this should be

set(ds_broad.variables) - set(ds_broad.xindexes) - set(ds_broad.dims)
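The difference is easy to see on a small stacked dataset. This is a hypothetical sketch with invented names; `ds_broad` in flox would play the role of `ds` here:

```python
import numpy as np
import xarray as xr

# Minimal dataset with a stacked MultiIndex, mimicking the situation
# inside xarray_reduce (names are illustrative only).
ds = xr.Dataset(
    {"vort": (("i", "face"), np.zeros((4, 2)))},
    coords={"i": np.arange(4), "face": np.arange(2)},
).stack(region=["i", "face"])

variables = set(ds.variables)  # {'vort', 'region', 'i', 'face'}
dims = set(ds.dims)            # {'region'}
indexed = set(ds.xindexes)     # 'region' plus its level coords 'i' and 'face'

# The current loop also visits the MultiIndex level coordinates:
old = variables - dims            # includes 'i' and 'face'
# The suggested fix skips every index-backed coordinate:
new = variables - indexed - dims  # only 'vort' remains
```

Copying `'i'` or `'face'` on their own is what later corrupts the `region` index, so excluding `ds.xindexes` sidesteps the problem.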

from flox.

TomNicholas commented on June 12, 2024

Reproducer:

import xarray as xr
import pandas as pd
import numpy as np

from flox.xarray import xarray_reduce


vort = xr.DataArray(
    name='vort',
    data=np.random.uniform(size=(4, 2)), 
    dims=['i', 'face'],
    coords={
        'i': ('i', np.arange(4)), 
        'face': ('face', np.arange(2))
    }
)

regions = vort.coarsen(
    i=2,
).construct(
    i=("i_region_coarse", "i_region"),
).stack(region=['face', 'i_region_coarse'])


def hist(*args, dim, bins, weights=None):
    
    if weights is None:
        weights = xr.DataArray(1)
    
    bin_intervals = tuple(pd.IntervalIndex.from_breaks(b) for b in bins)
    
    result = xarray_reduce(
        weights,                        # weights
        *args,                          # variables we want to bin
        func="count",                   # count occurrences falling in bins
        expected_groups=bin_intervals,  # bins for each variable
        dim=dim,                        # broadcast dimensions
        fill_value=np.NaN,
    )
    
    return result


h = hist(
    regions,
    dim=['i_region'],
    bins=[np.linspace(0, 1, 10)],
)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[54], line 1
----> 1 h = hist(
      2     regions,
      3     dim=['i_region'],
      4     bins=[np.linspace(0, 1, 10)],
      5 )

Cell In[53], line 11, in hist(dim, bins, weights, *args)
      7     weights = xr.DataArray(1)
      9 bin_intervals = tuple(pd.IntervalIndex.from_breaks(b) for b in bins)
---> 11 result = xarray_reduce(
     12     weights,                        # weights
     13     *args,                          # variables we want to bin
     14     func="count",                   # count occurrences falling in bins
     15     expected_groups=bin_intervals,  # bins for each variable
     16     dim=dim,                        # broadcast dimensions
     17     fill_value=np.NaN,
     18 )
     20 return result

File /srv/conda/envs/notebook/lib/python3.10/site-packages/flox/xarray.py:436, in xarray_reduce(obj, func, expected_groups, isbin, sort, dim, fill_value, dtype, method, engine, keep_attrs, skipna, min_count, reindex, *by, **finalize_kwargs)
    434 for var in set(ds_broad.variables) - set(ds_broad.dims):
    435     if all(d not in ds_broad[var].dims for d in dim_tuple):
--> 436         actual[var] = ds_broad[var]
    438 for name, expect, by_ in zip(group_names, expected_groups, by_da):
    439     # Can't remove this till xarray handles IntervalIndex
    440     if isinstance(expect, pd.IntervalIndex):

File /srv/conda/envs/notebook/lib/python3.10/site-packages/xarray/core/dataset.py:1491, in Dataset.__setitem__(self, key, value)
   1486     if isinstance(value, Dataset):
   1487         raise TypeError(
   1488             "Cannot assign a Dataset to a single key - only a DataArray or Variable "
   1489             "object can be stored under a single key."
   1490         )
-> 1491     self.update({key: value})
   1493 elif utils.iterable_of_hashable(key):
   1494     keylist = list(key)

File /srv/conda/envs/notebook/lib/python3.10/site-packages/xarray/core/dataset.py:4945, in Dataset.update(self, other)
   4909 def update(self: T_Dataset, other: CoercibleMapping) -> T_Dataset:
   4910     """Update this dataset's variables with those from another dataset.
   4911 
   4912     Just like :py:meth:`dict.update` this is a in-place operation.
   (...)
   4943     Dataset.merge
   4944     """
-> 4945     merge_result = dataset_update_method(self, other)
   4946     return self._replace(inplace=True, **merge_result._asdict())

File /srv/conda/envs/notebook/lib/python3.10/site-packages/xarray/core/merge.py:1093, in dataset_update_method(dataset, other)
   1090             if coord_names:
   1091                 other[key] = value.drop_vars(coord_names)
-> 1093 return merge_core(
   1094     [dataset, other],
   1095     priority_arg=1,
   1096     indexes=dataset.xindexes,
   1097     combine_attrs="override",
   1098 )

File /srv/conda/envs/notebook/lib/python3.10/site-packages/xarray/core/merge.py:746, in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value)
    744 collected = collect_variables_and_indexes(aligned, indexes=indexes)
    745 prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
--> 746 variables, out_indexes = merge_collected(
    747     collected, prioritized, compat=compat, combine_attrs=combine_attrs
    748 )
    750 dims = calculate_dimensions(variables)
    752 coord_names, noncoord_names = determine_coords(coerced)

File /srv/conda/envs/notebook/lib/python3.10/site-packages/xarray/core/merge.py:243, in merge_collected(grouped, prioritized, compat, combine_attrs, equals)
    240     equals = {}
    242 _assert_compat_valid(compat)
--> 243 _assert_prioritized_valid(grouped, prioritized)
    245 merged_vars: dict[Hashable, Variable] = {}
    246 merged_indexes: dict[Hashable, Index] = {}

File /srv/conda/envs/notebook/lib/python3.10/site-packages/xarray/core/merge.py:190, in _assert_prioritized_valid(grouped, prioritized)
    188 common_names_str = ", ".join(f"{k!r}" for k in common_names)
    189 index_names_str = ", ".join(f"{k!r}" for k in index_coord_names)
--> 190 raise ValueError(
    191     f"cannot set or update variable(s) {common_names_str}, which would corrupt "
    192     f"the following index built from coordinates {index_names_str}:\n"
    193     f"{indexes[index_id]!r}"
    194 )

ValueError: cannot set or update variable(s) 'i_region_coarse', which would corrupt the following index built from coordinates 'region', 'face', 'i_region_coarse':
PandasIndex(MultiIndex([(0, 0),
            (0, 1),
            (1, 0),
            (1, 1)],
           name='region'))
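For context, the ValueError is xarray's guard against updating a MultiIndex level coordinate on its own, which would desynchronise it from the index. A minimal sketch (assumed names, independent of flox) that hits the same check directly:

```python
import numpy as np
import xarray as xr

# A dataset whose 'region' dimension carries a MultiIndex over 'face' and 'i'.
ds = xr.Dataset(coords={"i": [0, 1], "face": [0, 1]}).stack(region=["face", "i"])

# Assigning to a level coordinate alone is refused, which is exactly
# what flox's propagation loop ran into.
msg = ""
try:
    ds["face"] = ("region", np.zeros(4))
except ValueError as err:
    msg = str(err)
print(msg)  # mentions corrupting the index built from 'region', 'face', 'i'
```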

I can have a go at fixing it now though.


TomNicholas commented on June 12, 2024

Your suggestion appears to fix it! I'll make a PR.

