
Description
I am attempting to port a Viterbi inference algorithm written in Python/NumPy to C++/xtensor to get a speed boost. With an almost identical implementation in both, the Python version completes the main function in 0.35 seconds, while the xtensor implementation takes 19 seconds, with or without xsimd. I am struggling to understand how such a large slowdown can occur and how to get around it.
For reference, here is the Python implementation, with the main function running in 0.35 seconds:
import time
import numpy as np

class viterbi():
    def __init__(self, N, T, trunc, aBl, aDl, aDsl, log_trans_matrix, initial_state_potential):
        self.N = N
        self.T = T
        self.trunc = trunc
        self.aBl = aBl
        self.aDl = aDl
        self.aDsl = aDsl
        self.log_trans_matrix = log_trans_matrix
        self.initial_state_potential = initial_state_potential

    def cumulative_obs_potentials(self, t):
        stop = None if self.trunc is None else min(self.T, t + self.trunc)
        return np.cumsum(self.aBl[t:stop], axis=0), 0.

    def dur_potentials(self, t):
        stop = self.T - t if self.trunc is None else min(self.T - t, self.trunc)
        return self.aDl[:stop]

    def dur_survival_potentials(self, t):
        return self.aDsl[self.T - t - 1] if (self.trunc is None or self.T - t > self.trunc) \
            else -np.inf

    def trans_potentials(self, t):
        return self.log_trans_matrix

    @staticmethod
    def hsmm_maximizing_assignment(N, T, trans_potentials, initial_state_potential,
                                   cumulative_obs_potentials, dur_potentials,
                                   dur_survival_potentials, right_censoring=True):
        beta_scores, beta_args = np.empty((T, N)), np.empty((T, N), dtype=int)
        betastar_scores, betastar_args = np.empty((T, N)), np.empty((T, N), dtype=int)
        beta_scores[-1] = 0.
        for t in range(T - 1, -1, -1):
            cB, offset = cumulative_obs_potentials(t)
            vals = beta_scores[t:t + cB.shape[0]] + cB + dur_potentials(t)
            if right_censoring:
                vals = np.vstack((vals, cB[-1] + dur_survival_potentials(t)))
            vals -= offset
            vals.max(axis=0, out=betastar_scores[t])
            vals.argmax(axis=0, out=betastar_args[t])
            vals = betastar_scores[t] + trans_potentials(t - 1)
            vals.max(axis=1, out=beta_scores[t - 1])
            vals.argmax(axis=1, out=beta_args[t - 1])
        beta_scores[-1] = 0.

        stateseq = np.empty(T, dtype='int32')
        t = 0
        state = (betastar_scores[t] + initial_state_potential).argmax()
        dur = betastar_args[t, state]
        stateseq[t:t + dur] = state
        t += dur
        while t < T:
            state = beta_args[t - 1, state]
            dur = betastar_args[t, state] + 1
            stateseq[t:t + dur] = state
            t += dur
        return stateseq
v = viterbi(N, T, trunc, aBl, aDl, aDsl, log_trans_matrix, initial_state_potential)
t1 = time.time()
stateseq = v.hsmm_maximizing_assignment(v.N,
                                        v.T,
                                        v.trans_potentials,
                                        v.initial_state_potential,
                                        v.cumulative_obs_potentials,
                                        v.dur_potentials,
                                        v.dur_survival_potentials
                                        )
print(time.time()-t1)
0.352541446685791
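In case anyone wants to run the reference implementation without my .npy files, here is a minimal sketch that generates synthetic inputs. The shapes are my assumption based on how the arrays are indexed (aBl, aDl and aDsl as (T, N); log_trans_matrix as (N, N); initial_state_potential of length N), and the values are random placeholders intended only for timing, not my real data:
import numpy as np

T, N, trunc = 6000, 25, 200

rng = np.random.default_rng(0)
# Random log-potentials: any finite values are enough to time the algorithm.
aBl = np.log(rng.random((T, N)))                 # per-timestep observation log-likelihoods
aDl = np.log(rng.random((T, N)))                 # per-state duration log-pmf
aDsl = np.log(rng.random((T, N)))                # per-state duration log-survival
log_trans_matrix = np.log(rng.random((N, N)))    # transition log-probabilities
initial_state_potential = np.log(rng.random(N))  # initial state log-probabilities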
Here is my CMakeLists.txt; as stated above, adding or removing the xsimd lines does not change performance:
cmake_minimum_required(VERSION 3.0.0)
project(viterbi VERSION 0.1.0)
include(CTest)
enable_testing()
add_definitions(-DXTENSOR_ENABLE_XSIMD)
add_definitions(-DXTENSOR_USE_XSIMD)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -mavx2 -ffast-math")
set(CMAKE_PREFIX_PATH "/home/jberesford/anaconda3/envs/cling")
add_executable(viterbi main.cpp)
set(CPACK_PROJECT_NAME ${PROJECT_NAME})
set(CPACK_PROJECT_VERSION ${PROJECT_VERSION})
include(CPack)
set(XTENSOR_USE_XSIMD 1)
find_package(xtensor REQUIRED)
target_include_directories(viterbi PUBLIC ${xtensor_INCLUDE_DIRS})
target_link_libraries(viterbi PUBLIC xtensor)
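One thing I am not certain about is whether the -O3 -mavx2 flags actually reach the compiler, since they are only set on CMAKE_CXX_FLAGS_RELEASE. Assuming the default Makefile generator, checking the echoed compile commands after configuring an explicit Release build would show this:
cmake -DCMAKE_BUILD_TYPE=Release ..
make VERBOSE=1    # the compiler invocations should include -O3 -mavx2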
And finally my main.cpp, which is almost exactly the same implementation as the Python version, with NumPy swapped for xtensor:
#include <iostream>
#include <chrono>
#include "xtensor/xtensor.hpp"
#include "xtensor/xio.hpp"
#include "xtensor/xview.hpp"
#include "xtensor/xnpy.hpp"
#include "xtensor/xmath.hpp"
#include "xtensor/xarray.hpp"
#include "xtensor/xsort.hpp"
using namespace xt::placeholders;
// Data
const auto aBl = xt::load_npy<double>("Data/aBl.npy");
const auto aDl = xt::load_npy<double>("Data/aDl.npy");
const auto aDsl = xt::load_npy<double>("Data/aDsl.npy");
const auto trans_potentials = xt::load_npy<double>("Data/log_trans_matrix.npy");
const auto initial_state_potential = xt::load_npy<double>("Data/log_pi_0.npy");
// Global vars
const int T = 6000;
const int N = 25;
const int truncVal = 200;
const auto inf = std::numeric_limits<double>::infinity();
const bool right_censoring = true;
// Functions
// cumulative_obs_potentials
auto cumulative_obs_potentials(int t){
    int stop = std::min(T, t + truncVal);
    return xt::cumsum(xt::view(aBl, xt::range(t, stop), xt::all()), 0);
}

// dur_potentials
auto dur_potentials(int t){
    int stop = std::min(T - t, truncVal);
    return xt::view(aDl, xt::range(_, stop), xt::all());
}

// dur_survival_potentials
// (the T-t <= truncVal branch, which returns -inf in the Python version, is handled at the call site in main)
auto dur_survival_potentials(int t){
    return xt::view(aDsl, T - t - 1, xt::all());
}
int main(int, char**){
    auto time1 = std::chrono::high_resolution_clock::now();

    // Beta initialisations
    xt::xtensor<double,2> beta_scores = xt::empty<double>({T,N});
    xt::xtensor<int,2> beta_args = xt::empty<int>({T,N});
    xt::xtensor<double,2> betastar_scores = xt::empty<double>({T,N});
    xt::xtensor<int,2> betastar_args = xt::empty<int>({T,N});

    // Initialise last value of beta_scores to 0.
    xt::view(beta_scores, beta_scores.shape(0)-1, xt::all()) = 0.;

    for (int t = T-1; t > -1; --t){
        auto cB = cumulative_obs_potentials(t);
        double offset = 0.;
        auto vals = xt::eval(xt::view(beta_scores, xt::range(t, t+cB.shape(0)), xt::all()) + cB + dur_potentials(t));
        if (T-t > truncVal){
            vals = xt::vstack(xt::xtuple(vals, xt::view(cB, xt::range(cB.shape(0)-1, cB.shape(0)), xt::all()) + dur_survival_potentials(t)));
        } else {
            vals = xt::vstack(xt::xtuple(vals, xt::view(cB, xt::range(cB.shape(0)-1, cB.shape(0)), xt::all()) - inf));
        }
        vals -= offset;
        xt::view(betastar_scores, t, xt::all()) = xt::amax(vals, 0);
        xt::view(betastar_args, t, xt::all()) = xt::argmax(vals, 0);
        vals = xt::eval(xt::view(betastar_scores, t, xt::all()) + trans_potentials);
        if (t > 0){
            xt::view(beta_scores, t-1, xt::all()) = xt::amax(vals, 1);
            xt::view(beta_args, t-1, xt::all()) = xt::argmax(vals, 1);
        }
    }

    // Initialise last value of beta_scores to 0.
    xt::view(beta_scores, beta_scores.shape(0)-1, xt::all()) = 0.;

    xt::xtensor<int,1> stateseq = xt::empty<int>({T});
    int t = 0;
    int state = xt::argmax(xt::view(betastar_scores, t, xt::all()) + initial_state_potential)(0);
    int dur = betastar_args(t, state);
    xt::view(stateseq, xt::range(t, t+dur)) = state;
    t += dur;
    while (t < T){
        state = beta_args(t-1, state);
        dur = betastar_args(t, state) + 1;
        xt::view(stateseq, xt::range(t, t+dur)) = state;
        t += dur;
    }

    auto time2 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::seconds>( time2 - time1 ).count();
    std::cout << "Time taken (s): " << duration << "\n";

    // Print state sequence
    std::cout << stateseq << "\n";
}
Time taken (s): 19
Both versions of the algorithm output exactly the same result (stateseq), so I am confident the operations match, but I am confused as to why xtensor is so much slower than NumPy and why using xsimd makes no difference.
Many thanks!