Comments (13)
This is the full fixed code:
// Hossein Moein
// September 12, 2017
/*
Copyright (c) 2019-2026, Hossein Moein
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
- Neither the name of Hossein Moein and/or the DataFrame nor the
  names of its contributors may be used to endorse or promote products
  derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL Hossein Moein BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <DataFrame/DataFrame.h>

#include <algorithm>
#include <cstdio>
#include <limits>
#include <tuple>
#include <type_traits>
#include <utility>
// ----------------------------------------------------------------------------
namespace hmdf
{
// Joins this frame with rhs on their index values.
//
// rhs  The other frame. It must derive from the standard DataFrame of the
//      same index type, or from View / PtrView (enforced at compile time).
// mp   Which join to perform. Any value other than inner_join, left_join or
//      right_join is handled as left_right_join.
//
// Both indices are turned into (pointer-to-value, original row number) pairs
// and sorted by value, so the row-pairing routines can walk them in key order
// without touching the frames' own storage.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
DataFrame<I, H>
DataFrame<I, H>::
join_by_index (const RHS_T &rhs, join_policy mp) const  {

    using std_df_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;
    using sort_pair_t = JoinSortingPair<IndexType>;
    using self_t = decltype(*this);

    static_assert(
        std::is_base_of<std_df_t, RHS_T>::value ||
        std::is_base_of<View, RHS_T>::value ||
        std::is_base_of<PtrView, RHS_T>::value,
        "The rhs argument to join_by_index() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>");

    const auto      &left_index = get_index();
    const auto      &right_index = rhs.get_index();
    const size_type left_count = left_index.size();
    const size_type right_count = right_index.size();

    StlVecType<sort_pair_t> left_sorted;
    StlVecType<sort_pair_t> right_sorted;

    left_sorted.reserve(left_count);
    for (size_type row = 0; row != left_count; ++row)
        left_sorted.push_back(std::make_pair(&(left_index[row]), row));

    right_sorted.reserve(right_count);
    for (size_type row = 0; row != right_count; ++row)
        right_sorted.push_back(std::make_pair(&(right_index[row]), row));

    // Order by the pointed-to index value, not by the pointer
    const auto  by_key =
        [] (const sort_pair_t &a, const sort_pair_t &b) -> bool  {
            return (*(a.first) < *(b.first));
        };

    std::sort(left_sorted.begin(), left_sorted.end(), by_key);
    std::sort(right_sorted.begin(), right_sorted.end(), by_key);

    if (mp == join_policy::inner_join)
        return (index_inner_join_<self_t, RHS_T, Ts ...>
                    (*this, rhs, left_sorted, right_sorted));
    if (mp == join_policy::left_join)
        return (index_left_join_<self_t, RHS_T, Ts ...>
                    (*this, rhs, left_sorted, right_sorted));
    if (mp == join_policy::right_join)
        return (index_right_join_<self_t, RHS_T, Ts ...>
                    (*this, rhs, left_sorted, right_sorted));

    // join_policy::left_right_join and anything unexpected
    return (index_left_right_join_<self_t, RHS_T, Ts ...>
                (*this, rhs, left_sorted, right_sorted));
}
// ----------------------------------------------------------------------------
// Joins this frame with rhs on the values of a column both frames have.
//
// rhs   The other frame. It must derive from the standard DataFrame of the
//       same index type, or from View / PtrView (enforced at compile time).
// name  Name of the column of type T to join on; it is fetched from both
//       frames with get_column<T>().
// mp    Which join to perform. Any value other than inner_join, left_join or
//       right_join is handled as left_right_join.
//
// The result is indexed by unsigned int (see the return type).
//
template<typename I, typename H>
template<typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, H>
DataFrame<I, H>::
join_by_column (const RHS_T &rhs, const char *name, join_policy mp) const  {

    using std_df_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;
    using sort_pair_t = JoinSortingPair<T>;
    using self_t = decltype(*this);

    static_assert(
        std::is_base_of<std_df_t, RHS_T>::value ||
        std::is_base_of<View, RHS_T>::value ||
        std::is_base_of<PtrView, RHS_T>::value,
        "The rhs argument to join_by_column() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>");

    const auto      &left_col = get_column<T>(name);
    const auto      &right_col = rhs.template get_column<T>(name);
    const size_type left_count = left_col.size();
    const size_type right_count = right_col.size();

    StlVecType<sort_pair_t> left_sorted;
    StlVecType<sort_pair_t> right_sorted;

    left_sorted.reserve(left_count);
    for (size_type row = 0; row != left_count; ++row)
        left_sorted.push_back(std::make_pair(&(left_col[row]), row));

    right_sorted.reserve(right_count);
    for (size_type row = 0; row != right_count; ++row)
        right_sorted.push_back(std::make_pair(&(right_col[row]), row));

    // Order by the pointed-to column value, not by the pointer
    const auto  by_key =
        [] (const sort_pair_t &a, const sort_pair_t &b) -> bool  {
            return (*(a.first) < *(b.first));
        };

    std::sort(left_sorted.begin(), left_sorted.end(), by_key);
    std::sort(right_sorted.begin(), right_sorted.end(), by_key);

    if (mp == join_policy::inner_join)
        return (column_inner_join_<self_t, RHS_T, T, Ts ...>
                    (*this, rhs, name, left_sorted, right_sorted));
    if (mp == join_policy::left_join)
        return (column_left_join_<self_t, RHS_T, T, Ts ...>
                    (*this, rhs, name, left_sorted, right_sorted));
    if (mp == join_policy::right_join)
        return (column_right_join_<self_t, RHS_T, T, Ts ...>
                    (*this, rhs, name, left_sorted, right_sorted));

    // join_policy::left_right_join and anything unexpected
    return (column_left_right_join_<self_t, RHS_T, T, Ts ...>
                (*this, rhs, name, left_sorted, right_sorted));
}
// ----------------------------------------------------------------------------
// Visits the data columns of lhs and rhs with the join functors so that they
// get loaded into result according to joined_index_idx.
//
// lhs, rhs          The two frames being joined.
// joined_index_idx  The (lhs row, rhs row) pairs produced by one of the
//                   get_*_index_idx_vector_() routines.
// result            The frame being populated.
// skip_col_name     If not null, the column with this name is skipped in
//                   both frames (the column-join path passes the join
//                   column here, having already loaded it itself).
//
// The whole operation runs under lock_.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename IDX_T, typename ... Ts>
void DataFrame<I, H>::
join_helper_common_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const IndexIdxVector &joined_index_idx,
    DataFrame<IDX_T, HeteroVector<std::size_t(H::align_value)>> &result,
    const char *skip_col_name)  {

    const SpinGuard guard(lock_);

    // Load the common and lhs columns
    for (const auto &iter : lhs.column_list_)  {
        // Decide about skipping first; no point in searching rhs for a
        // column that is not going to be loaded
        if (skip_col_name && iter.first == skip_col_name)  continue;

        const auto  rhs_citer = rhs.column_tb_.find(iter.first);

        // Common column between two frames
        if (rhs_citer != rhs.column_tb_.end())  {
            index_join_functor_common_<decltype(result), Ts ...> functor(
                iter.first.c_str(),
                rhs,
                joined_index_idx,
                result);

            lhs.data_[iter.second].change(functor);
        }
        else  {  // lhs only column
            // 0 = Left
            index_join_functor_oneside_<0, decltype(result), Ts ...> functor (
                iter.first.c_str(),
                joined_index_idx,
                result);

            lhs.data_[iter.second].change(functor);
        }
    }

    // Load the rhs columns
    for (const auto &iter : rhs.column_list_)  {
        if (skip_col_name && iter.first == skip_col_name)  continue;

        const auto  lhs_citer = lhs.column_tb_.find(iter.first);

        if (lhs_citer == lhs.column_tb_.end())  {  // rhs only column
            // 1 = Right
            index_join_functor_oneside_<1, decltype(result), Ts ...> functor (
                iter.first.c_str(),
                joined_index_idx,
                result);

            rhs.data_[iter.second].change(functor);
        }
    }
}
// ----------------------------------------------------------------------------
// Builds the result frame of an index join from the paired row numbers.
//
// lhs, rhs          The two frames being joined.
// joined_index_idx  One entry per result row: element 0 is the lhs row
//                   number, element 1 the rhs row number.
//                   std::numeric_limits<size_type>::max() in element 0 means
//                   the row has no lhs side.
//
// The result index value of a row is taken from lhs when the row has an lhs
// side, otherwise from rhs. The data columns are then loaded by
// join_helper_common_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_join_helper_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const IndexIdxVector &joined_index_idx)  {

    DataFrame<IndexType, HeteroVector<align_value>> result;
    StlVecType<IndexType>                           result_index;

    // Load the index
    result_index.reserve(joined_index_idx.size());
    for (const auto &citer : joined_index_idx)  {
        const size_type left_i = std::get<0>(citer);

        result_index.push_back(
            left_i != std::numeric_limits<size_type>::max()
                ? lhs.indices_[left_i] : rhs.indices_[std::get<1>(citer)]);
    }
    result.load_index(std::move(result_index));

    join_helper_common_<LHS_T, RHS_T, IndexType, Ts ...>
        (lhs, rhs, joined_index_idx, result);
    return(result);
}
// ----------------------------------------------------------------------------
// Builds the result frame of a column join from the paired row numbers.
//
// lhs, rhs          The two frames being joined.
// col_name          Name of the column of type T the join was done on.
// joined_index_idx  One entry per result row: element 0 is the lhs row
//                   number, element 1 the rhs row number.
//                   std::numeric_limits<size_type>::max() marks a missing
//                   side.
//
// The result has a generated unsigned int sequence index starting at 0 and
// these columns, loaded here:
//   "lhs.<DF_INDEX_COL_NAME>"  the lhs index value of each row, or
//                              get_nan<>() when the row has no lhs side
//   "rhs.<DF_INDEX_COL_NAME>"  likewise for rhs
//   col_name                   the join value, taken from lhs when the row
//                              has an lhs side, otherwise from rhs
// All remaining columns are loaded by join_helper_common_(), which is told
// to skip col_name.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_join_helper_(const LHS_T &lhs,
                    const RHS_T &rhs,
                    const char *col_name,
                    const IndexIdxVector &joined_index_idx)  {

    // LHS_T / RHS_T may be reference types (callers pass decltype(*this))
    using left_idx_t = typename std::remove_reference<LHS_T>::type::IndexType;
    using right_idx_t = typename std::remove_reference<RHS_T>::type::IndexType;

    const size_type                                     jii_s =
        joined_index_idx.size();
    DataFrame<unsigned int, HeteroVector<align_value>>  result;

    // Load the new result index
    result.load_index(
        DataFrame<unsigned int, HeteroVector<align_value>>::gen_sequence_index(
            0, static_cast<unsigned int>(jii_s), 1));

    // Load the lhs and rhs indices into two columns in the result
    // Also load the unified named column
    StlVecType<left_idx_t>  lhs_index;
    StlVecType<right_idx_t> rhs_index;
    StlVecType<T>           named_col_vec;
    const ColumnVecType<T>  &lhs_named_col_vec =
        lhs.template get_column<T>(col_name);
    const ColumnVecType<T>  &rhs_named_col_vec =
        rhs.template get_column<T>(col_name);

    lhs_index.reserve(jii_s);
    rhs_index.reserve(jii_s);
    named_col_vec.reserve(jii_s);
    for (const auto &citer : joined_index_idx)  {
        const size_type left_i = std::get<0>(citer);
        const size_type right_i = std::get<1>(citer);

        if (left_i != std::numeric_limits<size_type>::max())  {
            lhs_index.push_back(lhs.indices_[left_i]);
            named_col_vec.push_back(lhs_named_col_vec[left_i]);
        }
        else  {
            // No lhs side, so the join value has to come from rhs
            named_col_vec.push_back(rhs_named_col_vec[right_i]);
            lhs_index.push_back(get_nan<left_idx_t>());
        }
        if (right_i != std::numeric_limits<size_type>::max())
            rhs_index.push_back(rhs.indices_[right_i]);
        else
            rhs_index.push_back(get_nan<right_idx_t>());
    }

    {
        char            buffer[64];
        const SpinGuard guard(lock_);

        ::snprintf(buffer, sizeof(buffer) - 1, "lhs.%s", DF_INDEX_COL_NAME);
        result.template load_column<left_idx_t>(buffer,
                                                std::move(lhs_index),
                                                nan_policy::pad_with_nans,
                                                false);
        ::snprintf(buffer, sizeof(buffer) - 1, "rhs.%s", DF_INDEX_COL_NAME);
        result.template load_column<right_idx_t>(buffer,
                                                 std::move(rhs_index),
                                                 nan_policy::pad_with_nans,
                                                 false);
        result.template load_column<T>(col_name,
                                       std::move(named_col_vec),
                                       nan_policy::pad_with_nans,
                                       false);
    }

    join_helper_common_<LHS_T, RHS_T, unsigned int, Ts ...>
        (lhs, rhs, joined_index_idx, result, col_name);
    return(result);
}
// ----------------------------------------------------------------------------
template<typename I, typename H>
template
typename DataFrame<I, H>::IndexIdxVector
DataFrame<I, H>::get_inner_index_idx_vector_(
const StlVecType<JoinSortingPair> &col_vec_lhs,
const StlVecType<JoinSortingPair> &col_vec_rhs) {
size_type lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector joined_index_idx;
joined_index_idx.reserve(std::min(lhs_end, rhs_end));
while (lhs_current != lhs_end && rhs_current != rhs_end) {
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first))
lhs_current += 1;
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else //add this row to fix
rhs_current += 1;
}
}
return (joined_index_idx);
}
// ----------------------------------------------------------------------------
// Inner join on the index: pairs the rows with
// get_inner_index_idx_vector_() and builds the result with
// index_join_helper_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_inner_join_(const LHS_T &lhs,
                  const RHS_T &rhs,
                  const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
                  const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_inner_index_idx_vector_<IndexType>(col_vec_lhs,
                                                        col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Inner join on column col_name: pairs the rows with
// get_inner_index_idx_vector_() and builds the result with
// column_join_helper_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_inner_join_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const char *col_name,
                   const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                   const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_inner_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));
}
// ----------------------------------------------------------------------------
template<typename I, typename H>
template
typename DataFrame<I, H>::IndexIdxVector
DataFrame<I, H>::get_left_index_idx_vector_(
const StlVecType<JoinSortingPair> &col_vec_lhs,
const StlVecType<JoinSortingPair> &col_vec_rhs) {
size_type lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector joined_index_idx;
joined_index_idx.reserve(lhs_end);
while (lhs_current != lhs_end || rhs_current != rhs_end) {
if (lhs_current >= lhs_end) break;
if (rhs_current >= rhs_end) {
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
continue;
}
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else // add this row fix
rhs_current += 1;
}
}
return (joined_index_idx);
}
// ----------------------------------------------------------------------------
// Left join on the index: pairs the rows with get_left_index_idx_vector_()
// and builds the result with index_join_helper_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_left_join_(const LHS_T &lhs, const RHS_T &rhs,
                 const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
                 const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_left_index_idx_vector_<IndexType>(col_vec_lhs,
                                                       col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Left join on column col_name: pairs the rows with
// get_left_index_idx_vector_() and builds the result with
// column_join_helper_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_left_join_(const LHS_T &lhs,
                  const RHS_T &rhs,
                  const char *col_name,
                  const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                  const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_left_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));
}
// ----------------------------------------------------------------------------
template<typename I, typename H>
template
typename DataFrame<I, H>::IndexIdxVector
DataFrame<I, H>::get_right_index_idx_vector_(
const StlVecType<JoinSortingPair> &col_vec_lhs,
const StlVecType<JoinSortingPair> &col_vec_rhs) {
size_type lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector joined_index_idx;
joined_index_idx.reserve(rhs_end);
while (lhs_current != lhs_end || rhs_current != rhs_end) {
if (rhs_current >= rhs_end) break;
if (lhs_current >= lhs_end) {
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current++].second);
continue;
}
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first))
lhs_current += 1;
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else{
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current].second);
rhs_current += 1;
}
}
}
return (joined_index_idx);
}
// ----------------------------------------------------------------------------
// Right join on the index: pairs the rows with get_right_index_idx_vector_()
// and builds the result with index_join_helper_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_right_join_(const LHS_T &lhs, const RHS_T &rhs,
                  const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
                  const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_right_index_idx_vector_<IndexType>(col_vec_lhs,
                                                        col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Right join on column col_name: pairs the rows with
// get_right_index_idx_vector_() and builds the result with
// column_join_helper_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_right_join_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const char *col_name,
                   const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                   const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_right_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));
}
// ----------------------------------------------------------------------------
template<typename I, typename H>
template
typename DataFrame<I, H>::IndexIdxVector
DataFrame<I, H>::get_left_right_index_idx_vector_(
const StlVecType<JoinSortingPair> &col_vec_lhs,
const StlVecType<JoinSortingPair> &col_vec_rhs) {
size_type lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector joined_index_idx;
joined_index_idx.reserve(std::max(lhs_end, rhs_end));
while (lhs_current != lhs_end || rhs_current != rhs_end) {
if (lhs_current >= lhs_end && rhs_current < rhs_end) {
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current++].second);
continue;
}
if (rhs_current >= rhs_end && lhs_current < lhs_end) {
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
continue;
}
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first)) {
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
}
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else
{ //add this row to fix
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current].second);
rhs_current += 1;
} //add this row to fix
}
}
return (joined_index_idx);
}
// ----------------------------------------------------------------------------
// Left-right join on the index: pairs the rows with
// get_left_right_index_idx_vector_() and builds the result with
// index_join_helper_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_left_right_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_left_right_index_idx_vector_<IndexType>(col_vec_lhs,
                                                             col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Left-right join on column col_name: pairs the rows with
// get_left_right_index_idx_vector_() and builds the result with
// column_join_helper_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_left_right_join_(const LHS_T &lhs,
                        const RHS_T &rhs,
                        const char *col_name,
                        const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                        const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_left_right_index_idx_vector_<T>(col_vec_lhs,
                                                     col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Appends rhs to lhs in place.
//
// lhs              Frame that grows.
// rhs              Frame whose index and columns are appended.
// add_new_columns  If true, columns that exist only in rhs are visited too;
//                  otherwise only columns that exist in both frames are.
//
// The rhs index is appended to the lhs index first. The size the lhs index
// had before that is handed to every concat_functor_.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
void DataFrame<I, H>::
concat_helper_(LHS_T &lhs, const RHS_T &rhs, bool add_new_columns)  {

    using appender_t = concat_functor_<LHS_T, Ts ...>;

    const size_type original_size = lhs.get_index().size();
    const auto      &rhs_index = rhs.get_index();

    lhs.get_index().insert(lhs.get_index().end(),
                           rhs_index.begin(), rhs_index.end());

    // Columns present in both frames
    for (const auto &lhs_entry : lhs.column_list_)  {
        const auto  rhs_pos = rhs.column_tb_.find(lhs_entry.first);

        if (rhs_pos == rhs.column_tb_.end())  continue;

        appender_t  functor(lhs_entry.first.c_str(),
                            lhs,
                            false,
                            original_size);

        rhs.data_[rhs_pos->second].change(functor);
    }

    if (! add_new_columns)  return;

    // Columns present in rhs only
    for (const auto &rhs_entry : rhs.column_list_)  {
        const auto  lhs_pos = lhs.column_tb_.find(rhs_entry.first);

        if (lhs_pos != lhs.column_tb_.end())  continue;

        appender_t  functor(rhs_entry.first.c_str(),
                            lhs,
                            true,
                            original_size);

        rhs.data_[rhs_entry.second].change(functor);
    }
}
// ----------------------------------------------------------------------------
// Appends rhs to this frame in place, under lock_.
//
// rhs              Frame to append. It must derive from the standard
//                  DataFrame of the same index type, or from View / PtrView.
// add_new_columns  Forwarded to concat_helper_().
//
// NOTE(review): decltype(*this) is a reference type, for which
// std::is_base_of yields false, so the negated second half of the assertion
// looks like it can never fail -- confirm whether that is intended.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
void
DataFrame<I, H>::self_concat(const RHS_T &rhs, bool add_new_columns)  {

    using std_df_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;
    using self_t = decltype(*this);

    static_assert(
        (std::is_base_of<std_df_t, RHS_T>::value ||
         std::is_base_of<View, RHS_T>::value ||
         std::is_base_of<PtrView, RHS_T>::value) &&
        ! std::is_base_of<std_df_t, self_t>::value,
        "The rhs argument to self_concat() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>. "
        "Self must be StdDataFrame<IndexType>");

    const SpinGuard guard(lock_);

    concat_helper_<self_t, RHS_T, Ts ...>(*this, rhs, add_new_columns);
}
// ----------------------------------------------------------------------------
// Returns a new frame made of this frame followed by rhs; neither input is
// modified. Runs under lock_.
//
// rhs  Frame to append. It must derive from the standard DataFrame of the
//      same index type, or from View / PtrView.
// cp   all_columns             start from a copy of this frame; rhs-only
//                              columns are added as well
//      lhs_and_common_columns  start from a copy of this frame; rhs-only
//                              columns are left out
//      common_columns          start from this frame's index plus only those
//                              columns that rhs has too
//      For any other value the result is returned empty.
//
// NOTE(review): decltype(*this) is a reference type, for which
// std::is_base_of yields false, so the negated second half of the assertion
// looks like it can never fail -- confirm whether that is intended.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
DataFrame<I, H>
DataFrame<I, H>::concat(const RHS_T &rhs, concat_policy cp) const  {

    using std_df_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;

    static_assert(
        (std::is_base_of<std_df_t, RHS_T>::value ||
         std::is_base_of<View, RHS_T>::value ||
         std::is_base_of<PtrView, RHS_T>::value) &&
        ! std::is_base_of<std_df_t, decltype(*this)>::value,
        "The rhs argument to concat() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>. "
        "Self must be StdDataFrame<IndexType>");

    DataFrame<I, HeteroVector<align_value>> result;
    const SpinGuard                         guard(lock_);

    if (cp == concat_policy::common_columns)  {
        result.load_index(this->get_index().begin(), this->get_index().end());

        // Seed the result with the columns both frames have
        for (const auto &lhs_entry : column_list_)  {
            if (rhs.column_tb_.find(lhs_entry.first) == rhs.column_tb_.end())
                continue;

            load_all_functor_<Ts ...>   functor(lhs_entry.first.c_str(),
                                                result);

            data_[lhs_entry.second].change(functor);
        }
        concat_helper_<decltype(result), RHS_T, Ts ...>(result, rhs, false);
    }
    else if (cp == concat_policy::all_columns ||
             cp == concat_policy::lhs_and_common_columns)  {
        const bool  with_rhs_only_columns = (cp == concat_policy::all_columns);

        result = *this;
        concat_helper_<decltype(result), RHS_T, Ts ...>(
            result, rhs, with_rhs_only_columns);
    }

    return (result);
}
// ----------------------------------------------------------------------------
// Returns a PtrView over this frame followed by rhs.
//
// rhs  The other frame.
// cp   all_columns             every column of this frame, then every column
//                              of rhs, is visited
//      lhs_and_common_columns  every column of this frame is visited, and
//                              the rhs column of the same name if there is
//                              one
//      common_columns          only columns that exist in both frames are
//                              visited, this frame's first
//      For any other value the view gets the index only.
//
// The view's index holds the addresses of this frame's index values followed
// by the addresses of the rhs index values.
//
// NOTE(review): decltype(*this) is a reference type, for which
// std::is_base_of yields false, so the second operand of the assertion is
// always true and the assertion looks like it can never fail -- confirm
// whether that is intended.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
typename DataFrame<I, H>::PtrView
DataFrame<I, H>::concat_view(RHS_T &rhs, concat_policy cp)  {

    using std_df_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;
    using idxvec_t = typename PtrView::IndexVecType;
    using loader_t = concat_load_view_functor_<PtrView, Ts ...>;

    static_assert(
        ! std::is_base_of<std_df_t, RHS_T>::value ||
        ! std::is_base_of<std_df_t, decltype(*this)>::value,
        "Currently, arguments to concat_view() can only be "
        "StdDataFrame<IndexType>.");

    PtrView         result;
    const size_type self_count = get_index().size();
    const size_type rhs_count = rhs.get_index().size();
    idxvec_t        combined_idx;

    combined_idx.reserve(self_count + rhs_count);
    for (size_type row = 0; row != self_count; ++row)
        combined_idx.push_back(&(get_index()[row]));
    for (size_type row = 0; row != rhs_count; ++row)
        combined_idx.push_back(&(rhs.get_index()[row]));
    result.indices_ = std::move(combined_idx);

    if (cp == concat_policy::all_columns)  {
        for (const auto &self_entry : column_list_)  {
            loader_t    functor(self_entry.first.c_str(), result);

            data_[self_entry.second].change(functor);
        }
        for (const auto &rhs_entry : rhs.column_list_)  {
            loader_t    functor(rhs_entry.first.c_str(), result);

            rhs.data_[rhs_entry.second].change(functor);
        }
    }
    else if (cp == concat_policy::lhs_and_common_columns)  {
        for (const auto &self_entry : column_list_)  {
            loader_t    functor(self_entry.first.c_str(), result);

            data_[self_entry.second].change(functor);

            // The same functor goes on to visit the rhs column, if any
            const auto  rhs_pos = rhs.column_tb_.find(self_entry.first);

            if (rhs_pos != rhs.column_tb_.end())
                rhs.data_[rhs_pos->second].change(functor);
        }
    }
    else if (cp == concat_policy::common_columns)  {
        for (const auto &self_entry : column_list_)  {
            loader_t    functor(self_entry.first.c_str(), result);
            const auto  rhs_pos = rhs.column_tb_.find(self_entry.first);

            if (rhs_pos == rhs.column_tb_.end())  continue;

            data_[self_entry.second].change(functor);
            rhs.data_[rhs_pos->second].change(functor);
        }
    }

    return (result);
}
// ----------------------------------------------------------------------------
// Const overload: returns a ConstPtrView over this frame followed by rhs.
//
// rhs  The other frame.
// cp   all_columns             every column of this frame, then every column
//                              of rhs, is visited
//      lhs_and_common_columns  every column of this frame is visited, and
//                              the rhs column of the same name if there is
//                              one
//      common_columns          only columns that exist in both frames are
//                              visited, this frame's first
//      For any other value the view gets the index only.
//
// The view's index holds the addresses of this frame's index values followed
// by the addresses of the rhs index values.
//
// NOTE(review): decltype(*this) is a reference type, for which
// std::is_base_of yields false, so the second operand of the assertion is
// always true and the assertion looks like it can never fail -- confirm
// whether that is intended.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
typename DataFrame<I, H>::ConstPtrView
DataFrame<I, H>::concat_view(RHS_T &rhs, concat_policy cp) const  {

    using std_df_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;
    using idxvec_t = typename ConstPtrView::IndexVecType;
    using loader_t = concat_load_view_functor_<ConstPtrView, Ts ...>;

    static_assert(
        ! std::is_base_of<std_df_t, RHS_T>::value ||
        ! std::is_base_of<std_df_t, decltype(*this)>::value,
        "Currently, arguments to concat_view() can only be "
        "StdDataFrame<IndexType>.");

    ConstPtrView    result;
    const size_type self_count = get_index().size();
    const size_type rhs_count = rhs.get_index().size();
    idxvec_t        combined_idx;

    combined_idx.reserve(self_count + rhs_count);
    for (size_type row = 0; row != self_count; ++row)
        combined_idx.push_back(&(get_index()[row]));
    for (size_type row = 0; row != rhs_count; ++row)
        combined_idx.push_back(&(rhs.get_index()[row]));
    result.indices_ = std::move(combined_idx);

    if (cp == concat_policy::all_columns)  {
        for (const auto &self_entry : column_list_)  {
            loader_t    functor(self_entry.first.c_str(), result);

            data_[self_entry.second].change(functor);
        }
        for (const auto &rhs_entry : rhs.column_list_)  {
            loader_t    functor(rhs_entry.first.c_str(), result);

            rhs.data_[rhs_entry.second].change(functor);
        }
    }
    else if (cp == concat_policy::lhs_and_common_columns)  {
        for (const auto &self_entry : column_list_)  {
            loader_t    functor(self_entry.first.c_str(), result);

            data_[self_entry.second].change(functor);

            // The same functor goes on to visit the rhs column, if any
            const auto  rhs_pos = rhs.column_tb_.find(self_entry.first);

            if (rhs_pos != rhs.column_tb_.end())
                rhs.data_[rhs_pos->second].change(functor);
        }
    }
    else if (cp == concat_policy::common_columns)  {
        for (const auto &self_entry : column_list_)  {
            loader_t    functor(self_entry.first.c_str(), result);
            const auto  rhs_pos = rhs.column_tb_.find(self_entry.first);

            if (rhs_pos == rhs.column_tb_.end())  continue;

            data_[self_entry.second].change(functor);
            rhs.data_[rhs_pos->second].change(functor);
        }
    }

    return (result);
}
} // namespace hmdf
// ----------------------------------------------------------------------------
// Local Variables:
// mode:C++
// tab-width:4
// c-basic-offset:4
// End:
from dataframe.
Thank you for looking into this.
Why don't you submit this as a PR (pull request)? That way it goes through all the tests and you get the credit for it, if you care about that.
from dataframe.
from dataframe.
from dataframe.
I looked at this further. I don't think this is a bug. It should work properly.
But I could be wrong. Can you show me through an example this is a bug?
Thanks
from dataframe.
My apologies for the late reply.
This is a sample:
/*******************************************/
#include <DataFrame/DataFrame.h> // Main DataFrame header
#include <DataFrame/DataFrameFinancialVisitors.h> // Financial algorithms
#include <DataFrame/DataFrameMLVisitors.h> // Machine-learning algorithms
#include <DataFrame/DataFrameStatsVisitors.h> // Statistical algorithms
#include <DataFrame/Utils/DateTime.h> // Cool and handy date-time object
using namespace hmdf;

// A DataFrame with ulong index type
//
using ULDataFrame = StdDataFrame<unsigned long>;

// A DataFrame with string index type
//
using StrDataFrame = StdDataFrame<std::string>;

// A DataFrame with DateTime index type
//
using DTDataFrame = StdDataFrame<DateTime>;
// Reproduction case: left join of a 14-row frame with a 1-row frame on the
// double column "djoincol". Both frames and the joined frame are printed to
// std::cout.
void test_index_left_join() {
using MyDataFrame = ULDataFrame;
std::cout << "\nTesting Index Left Join ..." << std::endl;
// Left frame: 14 index values
std::vector<unsigned long> idx =
{ 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
// Join column: the value 1 thirteen times, then 14
std::vector<double> djoincol1 = { 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 14 };
std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
// d3 (13 values) and i1 (5 values) are shorter than the 14-entry index
std::vector<double> d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
std::vector<int> i1 = { 22, 23, 24, 25, 99 };
MyDataFrame df;
df.load_data(std::move(idx),
std::make_pair("djoincol", djoincol1),
std::make_pair("col_1", d1),
std::make_pair("col_2", d2),
std::make_pair("col_3", d3),
std::make_pair("col_4", i1));
// Right frame: a single row whose join value is 1
std::vector<unsigned long> idx2 =
{1 };
std::vector<double> djoincol2 = { 1 };
std::vector<double> d12 = { 11 };
std::vector<double> d22 = { 18 };
std::vector<double> d32 = { 115 };
std::vector<int> i12 = { 122 };
MyDataFrame df2;
df2.load_data(std::move(idx2),
std::make_pair("djoincol", djoincol2),
std::make_pair("xcol_1", d12),
std::make_pair("col_2", d22),
std::make_pair("xcol_3", d32),
std::make_pair("col_4", i12));
std::cout << "First DF:" << std::endl;
df.write<std::ostream, double, int>(std::cout);
std::cout << "Second DF2:" << std::endl;
df2.write<std::ostream, double, int>(std::cout);
// Template arguments are <RHS_T, T, Ts ...>: T = double is the type of the
// join column, which leaves int as the only type in Ts
auto join_df =
df.join_by_column<decltype(df2), double, int>(df2,"djoincol", hmdf::join_policy::left_join);
std::cout << "Now The joined DF:" << std::endl;
join_df.write<std::ostream, double, int>(std::cout);
}
Output:
Testing Index Left Join ...
First DF:
INDEX:14::123450,123451,123452,123453,123454,123455,123456,123457,123458,123459,123460,123461,123462,123466,
djoincol:14::1,1,1,1,1,1,1,1,1,1,1,1,1,14,
col_1:14::1,2,3,4,5,6,7,8,9,10,11,12,13,14,
col_2:14::8,9,10,11,12,13,14,20,22,23,30,31,32,1.89,
col_3:14::15,16,15,18,19,16,21,0.34,1.56,0.34,2.3,0.34,19,nan,
col_4:14::22,23,24,25,99,0,0,0,0,0,0,0,0,0,
Second DF2:
INDEX:1::1,
djoincol:1::1,
xcol_1:1::11,
col_2:1::18,
xcol_3:1::115,
col_4:1::122,
Now The joined DF:
INDEX:14::0,1,2,3,4,5,6,7,8,9,10,11,12,13,
djoincol:14::1,1,1,1,1,1,1,1,1,1,1,1,1,14,
lhs.col_4:14::22,23,24,25,99,0,0,0,0,0,0,0,0,0,
rhs.col_4:14::122,0,0,0,0,0,0,0,0,0,0,0,0,0,
Should be:
djoincol:14::1,1,1,1,1,1,1,1,1,1,1,1,1,14,
lhs.col_4:14::22,23,24,25,99,0,0,0,0,0,0,0,0,0,
rhs.col_4:14::122,122,122,122,122,122,122,122,122,122,122,122,122,nan,
from dataframe.
I am a bit confused. The code that you are saying is wrong and that you corrected (line 577 in file DataFrame_join.tcc) is for the left-right join (aka merge). The code example above is for a left outer join. In other words, the example above doesn't execute the code you corrected.
from dataframe.
There is more than one similar bug. I posted the fixed code, with the changes marked by the comment "//add this row to fix".
from dataframe.
Can you please submit a pull request?
I am still confused. Your comments //add this row to fix
are only in two functions
get_inner_index_idx_vector_()
get_left_right_index_idx_vector_()
Neither of these functions is executed in the code sample you posted in test_index_left_join()
from dataframe.
get_left_right_index_idx_vector_:
while (lhs_current != lhs_end || rhs_current != rhs_end) {
.......
if (*(col_vec_lhs[lhs_current].first) <
(col_vec_rhs[rhs_current].first)) {
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
}
else {
if ((col_vec_lhs[lhs_current].first) ==
(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second, // here ,left record move to next*
col_vec_rhs[rhs_current].second);
else
{ //add this row to fix
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current].second);
//****** Without the added "{" and "}", the next line runs every time; if the next left record also equals rhs_current, rhs_current has already moved on to the next record. *********
rhs_current += 1;
} //add this row to fix
}
}
from dataframe.
Thank you for looking into this, but I believe the original behavior/code is correct. Making your changes would introduce the bug of repeating the RHS values where they shouldn't be.
Also your code sample above is missing some type specifications. The correct code is:
static void test_index_left_join() {
using MyDataFrame = ULDataFrame;
std::cout << "\nTesting Index Left Join ..." << std::endl;
std::vector<unsigned long> idx =
{ 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
std::vector<double> djoincol1 = { 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 14 };
std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
std::vector<double> d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
std::vector<int> i1 = { 22, 23, 24, 25, 99 };
MyDataFrame df;
df.load_data(std::move(idx),
std::make_pair("djoincol", djoincol1),
std::make_pair("col_1", d1),
std::make_pair("col_2", d2),
std::make_pair("col_3", d3),
std::make_pair("col_4", i1));
std::vector<unsigned long> idx2 = {1 };
std::vector<double> djoincol2 = { 1 };
std::vector<double> d12 = { 11 };
std::vector<double> d22 = { 18 };
std::vector<double> d32 = { 115 };
std::vector<int> i12 = { 122 };
MyDataFrame df2;
df2.load_data(std::move(idx2),
std::make_pair("djoincol", djoincol2),
std::make_pair("xcol_1", d12),
std::make_pair("col_2", d22),
std::make_pair("xcol_3", d32),
std::make_pair("col_4", i12));
std::cout << "First DF:" << std::endl;
df.write<std::ostream, double, int>(std::cout, io_format::csv2);
std::cout << "Second DF2:" << std::endl;
df2.write<std::ostream, double, int>(std::cout, io_format::csv2);
auto join_df =
df.join_by_column<decltype(df2), double, double, int>(df2, "djoincol", hmdf::join_policy::left_join);
std::cout << "Now The joined DF:" << std::endl;
join_df.write<std::ostream, double, int, unsigned long>(std::cout, io_format::csv2);
}
from dataframe.
/***********left table:*******************************/
std::vector<double> djoincol1 = { 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 14 };
std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
/***********right table:/
djoincol1 = { 1}
std::vector d2 = {2}
/***********right table:/
join on left.djoincol1 =right.djoincol1
I think result should be:
d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
d2 = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, nan };
from dataframe.
I believe the left join result should be
d2 = { 2, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan };
The rhs has only one 2
value, and if you repeat it, it means you are manufacturing data.
You can replicate this in a relational database like Postgres.
from dataframe.
Related Issues (20)
- Can't get_view HOT 3
- Is there any way of converting epoch to DateTime? HOT 2
- make command failed HOT 8
- Mybe Jan-2023 release source code is not correct 1.22 version? HOT 2
- Sort failing giving Segmentation Fault HOT 3
- Adding support for reset_index() HOT 1
- I hope to support some string operations eg: HOT 1
- Any efficient way to replace a particular column value( for example by index) without replacing all column values? HOT 2
- Does get_column return rows in index order or dataframe order? HOT 2
- Adding support for sort column by absolute value. HOT 2
- Unable to compile code using cmake HOT 4
- append row and visitor calculate unexpected HOT 2
- No write to file example HOT 4
- Dataframe length HOT 2
- Plausibility of adapting for C++17 in a fork? HOT 2
- How to filter the DataFrame? HOT 1
- conflict with include <windows.h> HOT 1
- Aggregate visitors can't be used in groupby HOT 3
- Error: specializing member ‘hmdf::DataFrame<int, hmdf::HeteroVector<0> >::set_lock’ requires ‘template<>’ syntax| HOT 9
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from dataframe.