
Description
I am attempting to port a Viterbi inference algorithm written in Python/NumPy to C++/xtensor to get a speed boost. With an almost identical implementation in both, the Python version completes the main function in 0.35 seconds, while the xtensor implementation takes 19 seconds, with or without xsimd. I am struggling to understand how such a large slowdown can occur and how to get around it.
For reference, here is the Python implementation, with the main function running in 0.35 seconds:
import time
import numpy as np

class viterbi():
    def __init__(self, N, T, trunc, aBl, aDl, aDsl, log_trans_matrix, initial_state_potential):
        self.N = N
        self.T = T
        self.trunc = trunc
        self.aBl = aBl
        self.aDl = aDl
        self.aDsl = aDsl
        self.log_trans_matrix = log_trans_matrix
        self.initial_state_potential = initial_state_potential

    def cumulative_obs_potentials(self, t):
        stop = None if self.trunc is None else min(self.T, t + self.trunc)
        return np.cumsum(self.aBl[t:stop], axis=0), 0.

    def dur_potentials(self, t):
        stop = self.T - t if self.trunc is None else min(self.T - t, self.trunc)
        return self.aDl[:stop]

    def dur_survival_potentials(self, t):
        return self.aDsl[self.T - t - 1] if (self.trunc is None or self.T - t > self.trunc) \
            else -np.inf

    def trans_potentials(self, t):
        return self.log_trans_matrix

    @staticmethod
    def hsmm_maximizing_assignment(N, T, trans_potentials, initial_state_potential,
                                   cumulative_obs_potentials, dur_potentials,
                                   dur_survival_potentials, right_censoring=True):
        beta_scores, beta_args = np.empty((T, N)), np.empty((T, N), dtype=int)
        betastar_scores, betastar_args = np.empty((T, N)), np.empty((T, N), dtype=int)
        beta_scores[-1] = 0.
        for t in range(T - 1, -1, -1):
            cB, offset = cumulative_obs_potentials(t)
            vals = beta_scores[t:t + cB.shape[0]] + cB + dur_potentials(t)
            if right_censoring:
                vals = np.vstack((vals, cB[-1] + dur_survival_potentials(t)))
            vals -= offset
            vals.max(axis=0, out=betastar_scores[t])
            vals.argmax(axis=0, out=betastar_args[t])
            vals = betastar_scores[t] + trans_potentials(t - 1)
            vals.max(axis=1, out=beta_scores[t - 1])
            vals.argmax(axis=1, out=beta_args[t - 1])
        beta_scores[-1] = 0.

        stateseq = np.empty(T, dtype='int32')
        t = 0
        state = (betastar_scores[t] + initial_state_potential).argmax()
        dur = betastar_args[t, state]
        stateseq[t:t + dur] = state
        t += dur
        while t < T:
            state = beta_args[t - 1, state]
            dur = betastar_args[t, state] + 1
            stateseq[t:t + dur] = state
            t += dur
        return stateseq
v = viterbi(N, T, trunc, aBl, aDl, aDsl, log_trans_matrix, initial_state_potential)
t1 = time.time()
stateseq = v.hsmm_maximizing_assignment(v.N,
                                        v.T,
                                        v.trans_potentials,
                                        v.initial_state_potential,
                                        v.cumulative_obs_potentials,
                                        v.dur_potentials,
                                        v.dur_survival_potentials
                                        )
print(time.time()-t1)
0.352541446685791
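In case anyone wants to run the reference implementation without my .npy files, here is a minimal sketch that generates synthetic inputs. The shapes are my assumption based on how the arrays are indexed (aBl, aDl and aDsl as (T, N); log_trans_matrix as (N, N); initial_state_potential of length N), and the values are random placeholders intended only for timing, not my real data:
import numpy as np

T, N, trunc = 6000, 25, 200

rng = np.random.default_rng(0)
# Random log-potentials: any finite values are enough to time the algorithm.
aBl = np.log(rng.random((T, N)))                 # per-timestep observation log-likelihoods
aDl = np.log(rng.random((T, N)))                 # per-state duration log-pmf
aDsl = np.log(rng.random((T, N)))                # per-state duration log-survival
log_trans_matrix = np.log(rng.random((N, N)))    # transition log-probabilities
initial_state_potential = np.log(rng.random(N))  # initial state log-probabilities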
Here is my CMakeLists.txt; as stated above, adding or removing the xsimd lines does not change performance:
cmake_minimum_required(VERSION 3.0.0)
project(viterbi VERSION 0.1.0)
include(CTest)
enable_testing()
add_definitions(-DXTENSOR_ENABLE_XSIMD)
add_definitions(-DXTENSOR_USE_XSIMD)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -mavx2 -ffast-math")
set(CMAKE_PREFIX_PATH "/home/jberesford/anaconda3/envs/cling")
add_executable(viterbi main.cpp)
set(CPACK_PROJECT_NAME ${PROJECT_NAME})
set(CPACK_PROJECT_VERSION ${PROJECT_VERSION})
include(CPack)
set(XTENSOR_USE_XSIMD 1)
find_package(xtensor REQUIRED)
target_include_directories(viterbi PUBLIC ${xtensor_INCLUDE_DIRS})
target_link_libraries(viterbi PUBLIC xtensor)
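One thing I am not certain about is whether the -O3 -mavx2 flags actually reach the compiler, since they are only set on CMAKE_CXX_FLAGS_RELEASE. Assuming the default Makefile generator, checking the echoed compile commands after configuring an explicit Release build would show this:
cmake -DCMAKE_BUILD_TYPE=Release ..
make VERBOSE=1    # the compiler invocations should include -O3 -mavx2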
And finally my main.cpp, which is almost exactly the same implementation as the Python version, with NumPy swapped for xtensor:
#include <iostream>
#include <chrono>
#include "xtensor/xtensor.hpp"
#include "xtensor/xio.hpp"
#include "xtensor/xview.hpp"
#include "xtensor/xnpy.hpp"
#include "xtensor/xmath.hpp"
#include "xtensor/xarray.hpp"
#include "xtensor/xsort.hpp"
using namespace xt::placeholders;
// Data
const auto aBl = xt::load_npy<double>("Data/aBl.npy");
const auto aDl = xt::load_npy<double>("Data/aDl.npy");
const auto aDsl = xt::load_npy<double>("Data/aDsl.npy");
const auto trans_potentials = xt::load_npy<double>("Data/log_trans_matrix.npy");
const auto initial_state_potential = xt::load_npy<double>("Data/log_pi_0.npy");
// Global vars
const int T = 6000;
const int N = 25;
const int truncVal = 200;
const auto inf = std::numeric_limits<double>::infinity();
const bool right_censoring = true;
// Functions
// cumulative_obs_potentials
auto cumulative_obs_potentials(int t){
    int stop = std::min(T, t + truncVal);
    return xt::cumsum(xt::view(aBl, xt::range(t, stop), xt::all()), 0);
}

// dur_potentials
auto dur_potentials(int t){
    int stop = std::min(T - t, truncVal);
    return xt::view(aDl, xt::range(_, stop), xt::all());
}

// dur_survival_potentials
// (the T-t <= truncVal branch, which returns -inf in the Python version, is handled at the call site in main)
auto dur_survival_potentials(int t){
    return xt::view(aDsl, T - t - 1, xt::all());
}
int main(int, char**){
    auto time1 = std::chrono::high_resolution_clock::now();

    // Beta initialisations
    xt::xtensor<double,2> beta_scores = xt::empty<double>({T,N});
    xt::xtensor<int,2> beta_args = xt::empty<int>({T,N});
    xt::xtensor<double,2> betastar_scores = xt::empty<double>({T,N});
    xt::xtensor<int,2> betastar_args = xt::empty<int>({T,N});

    // Initialise last value of beta_scores to 0.
    xt::view(beta_scores, beta_scores.shape(0)-1, xt::all()) = 0.;

    for (int t = T-1; t > -1; --t){
        auto cB = cumulative_obs_potentials(t);
        double offset = 0.;
        auto vals = xt::eval(xt::view(beta_scores, xt::range(t, t+cB.shape(0)), xt::all()) + cB + dur_potentials(t));
        if (T-t > truncVal){
            vals = xt::vstack(xt::xtuple(vals, xt::view(cB, xt::range(cB.shape(0)-1, cB.shape(0)), xt::all()) + dur_survival_potentials(t)));
        } else {
            vals = xt::vstack(xt::xtuple(vals, xt::view(cB, xt::range(cB.shape(0)-1, cB.shape(0)), xt::all()) - inf));
        }
        vals -= offset;
        xt::view(betastar_scores, t, xt::all()) = xt::amax(vals, 0);
        xt::view(betastar_args, t, xt::all()) = xt::argmax(vals, 0);
        vals = xt::eval(xt::view(betastar_scores, t, xt::all()) + trans_potentials);
        if (t > 0){
            xt::view(beta_scores, t-1, xt::all()) = xt::amax(vals, 1);
            xt::view(beta_args, t-1, xt::all()) = xt::argmax(vals, 1);
        }
    }

    // Initialise last value of beta_scores to 0.
    xt::view(beta_scores, beta_scores.shape(0)-1, xt::all()) = 0.;

    xt::xtensor<int,1> stateseq = xt::empty<int>({T});
    int t = 0;
    int state = xt::argmax(xt::view(betastar_scores, t, xt::all()) + initial_state_potential)(0);
    int dur = betastar_args(t, state);
    xt::view(stateseq, xt::range(t, t+dur)) = state;
    t += dur;
    while (t < T){
        state = beta_args(t-1, state);
        dur = betastar_args(t, state) + 1;
        xt::view(stateseq, xt::range(t, t+dur)) = state;
        t += dur;
    }

    auto time2 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::seconds>( time2 - time1 ).count();
    std::cout << "Time taken (s): " << duration << "\n";

    // Print state sequence
    std::cout << stateseq << "\n";
}
Time taken (s): 19
Both versions of the algorithm output exactly the same result (stateseq), so I am confident the operations match, but I am confused as to why xtensor is so much slower than NumPy and why using xsimd makes no difference.
Many thanks!