/* @cond INNERDOC */
/*!
 @file
 @brief
 Performance kernels dispatching code, for each type, submatrix size, and operation,
 specifically for the block compressed sparse stripes format.
 Kernels are unrolled, with no loops, only for user-specified blockings.
 */

/*

Copyright (C) 2008-2020 Michele Martone

This file is part of librsb.

librsb is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

librsb is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public
License along with librsb; see the file COPYING.
If not, see <http://www.gnu.org/licenses/>.

*/
/*
 The code in this file was generated automatically by an M4 script. 
 It is not meant to be used as an API (Application Programming Interface).
 p.s.: right now, only row major matrix access is considered.

 */
#include "rsb.h"
#include "rsb_common.h"
#include "rsb_internals.h"


#pragma GCC visibility push(hidden)


rsb_err_t rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For every row i in [br,bc) the nonzeroes stored at positions
	 * bpntr[i] .. bpntr[i+1]-1 of VA are multiplied by the rhs entries
	 * selected through bindx, and the row total is added to out[i].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		double acc=((double)(0));

		/* Unrolled part: four nonzeroes per pass, accumulated in storage order. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const rsb_coo_idx_t c1=bindx[nz+1];
			const rsb_coo_idx_t c2=bindx[nz+2];
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double x0=rhs[c0];
			const double x1=rhs[c1];
			const double x2=rhs[c2];
			const double x3=rhs[c3];
			const double v0=VA[nz+0];
			const double v1=VA[nz+1];
			const double v2=VA[nz+2];
			const double v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* Leftover (at most three) nonzeroes of the row. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];

			acc+=v0*x0;
			++nz;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For every row i in [br,bc) the nonzeroes stored at positions
	 * bpntr[i] .. bpntr[i+1]-1 of VA are multiplied by the rhs entries
	 * selected through bindx (each index is widened to rsb_coo_idx_t before use),
	 * and the row total is added to out[i].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		double acc=((double)(0));

		/* Unrolled part: four nonzeroes per pass, accumulated in storage order. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const rsb_coo_idx_t c1=bindx[nz+1];
			const rsb_coo_idx_t c2=bindx[nz+2];
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double x0=rhs[c0];
			const double x1=rhs[c1];
			const double x2=rhs[c2];
			const double x3=rhs[c3];
			const double v0=VA[nz+0];
			const double v1=VA[nz+1];
			const double v2=VA[nz+2];
			const double v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* Leftover (at most three) nonzeroes of the row. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];

			acc+=v0*x0;
			++nz;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For every stored row i in [br,bc), the rhs entry at offset i (read through
	 * rhs shifted by roff-coff) scales each nonzero of the row, and the product
	 * is added to the out entry at the nonzero's column index (written through
	 * out shifted by coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double *xsh=rhs+1*(roff-coff);	/* rhs as indexed by stored row */
	double *ysh=out+1*(coff-roff);		/* out as indexed by stored column */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		const double xr=xsh[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* Unrolled part: four products are formed first, then scattered in storage order. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const rsb_coo_idx_t c1=bindx[nz+1];
			const rsb_coo_idx_t c2=bindx[nz+2];
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			ysh[c0]+=p0;
			ysh[c1]+=p1;
			ysh[c2]+=p2;
			ysh[c3]+=p3;
			nz+=4;
		}

		/* Leftover (at most three) nonzeroes of the row. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			ysh[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For every stored row i in [br,bc), the rhs entry at offset i (read through
	 * rhs shifted by roff-coff) scales each nonzero of the row, and the product
	 * is added to the out entry at the nonzero's column index (written through
	 * out shifted by coff-roff). Column indices are widened to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double *xsh=rhs+1*(roff-coff);	/* rhs as indexed by stored row */
	double *ysh=out+1*(coff-roff);		/* out as indexed by stored column */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		const double xr=xsh[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* Unrolled part: four products are formed first, then scattered in storage order. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const rsb_coo_idx_t c1=bindx[nz+1];
			const rsb_coo_idx_t c2=bindx[nz+2];
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			ysh[c0]+=p0;
			ysh[c1]+=p1;
			ysh[c2]+=p2;
			ysh[c3]+=p3;
			nz+=4;
		}

		/* Leftover (at most three) nonzeroes of the row. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			ysh[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The values are real (double), so no conjugation is applied to them here.
	 * For every stored row i in [br,bc), the rhs entry at offset i (read through
	 * rhs shifted by roff-coff) scales each nonzero of the row, and the product
	 * is added to the out entry at the nonzero's column index (written through
	 * out shifted by coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double *xsh=rhs+1*(roff-coff);	/* rhs as indexed by stored row */
	double *ysh=out+1*(coff-roff);		/* out as indexed by stored column */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		const double xr=xsh[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* Unrolled part: four products are formed first, then scattered in storage order. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const rsb_coo_idx_t c1=bindx[nz+1];
			const rsb_coo_idx_t c2=bindx[nz+2];
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			ysh[c0]+=p0;
			ysh[c1]+=p1;
			ysh[c2]+=p2;
			ysh[c3]+=p3;
			nz+=4;
		}

		/* Leftover (at most three) nonzeroes of the row. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			ysh[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The values are real (double), so no conjugation is applied to them here.
	 * For every stored row i in [br,bc), the rhs entry at offset i (read through
	 * rhs shifted by roff-coff) scales each nonzero of the row, and the product
	 * is added to the out entry at the nonzero's column index (written through
	 * out shifted by coff-roff). Column indices are widened to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double *xsh=rhs+1*(roff-coff);	/* rhs as indexed by stored row */
	double *ysh=out+1*(coff-roff);		/* out as indexed by stored column */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		const double xr=xsh[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* Unrolled part: four products are formed first, then scattered in storage order. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const rsb_coo_idx_t c1=bindx[nz+1];
			const rsb_coo_idx_t c2=bindx[nz+2];
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			ysh[c0]+=p0;
			ysh[c1]+=p1;
			ysh[c2]+=p2;
			ysh[c3]+=p3;
			nz+=4;
		}

		/* Leftover (at most three) nonzeroes of the row. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			ysh[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero (i,j) contributes twice: VA*rhs[j] goes into out[i],
	 * and VA*x_i goes into the out entry for column j, where x_i is read through
	 * rhs shifted by roff-coff and the write goes through out shifted by coff-roff.
	 * For the first and the last nonzero of a row the second contribution is
	 * skipped when roff==coff and j==i, so that such an entry is not added twice.
	 * Rows with no nonzeroes leave out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double *xsh=rhs+1*(roff-coff);	/* rhs as indexed by stored row */
	double *ysh=out+1*(coff-roff);		/* out as indexed by stored column */
	const int same_off=(roff==coff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		double acc=((double)(0));
		const double xr=xsh[row];
		const rsb_nnz_idx_t first=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;

		/* First nonzero: the only leading position checked against the diagonal. */
		col=bindx[first];
		acc += VA[first]*rhs[1*col*1];
		if(!(same_off && col==row))
			ysh[(1)*(col)*1]+=VA[first]*xr;

		/* Interior nonzeroes, four per pass; the last nonzero of the row is left out. */
		for(nz=first+1;nz+3<row_end-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const double x0=rhs[c0];
			const double v0=VA[nz+0];
			const double p0=v0*xr;
			const rsb_coo_idx_t c1=bindx[nz+1];
			const double x1=rhs[c1];
			const double v1=VA[nz+1];
			const double p1=v1*xr;
			const rsb_coo_idx_t c2=bindx[nz+2];
			const double x2=rhs[c2];
			const double v2=VA[nz+2];
			const double p2=v2*xr;
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double x3=rhs[c3];
			const double v3=VA[nz+3];
			const double p3=v3*xr;

			acc += v0*x0;
			ysh[c0]+=p0;
			acc += v1*x1;
			ysh[c1]+=p1;
			acc += v2*x2;
			ysh[c2]+=p2;
			acc += v3*x3;
			ysh[c3]+=p3;
		}

		/* Remaining interior nonzeroes, one at a time. */
		for(;nz<row_end-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double p0=v0*xr;

			acc += v0*x0;
			ysh[c0]+=p0;
		}

		/* Last nonzero (if distinct from the first): checked against the diagonal again. */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(same_off && col==row))
				ysh[(1)*(col)*1]+=VA[nz]*xr;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Each stored nonzero (i,j) contributes twice: VA*rhs[j] goes into out[i],
	 * and VA*x_i goes into the out entry for column j, where x_i is read through
	 * rhs shifted by roff-coff and the write goes through out shifted by coff-roff.
	 * For the first and the last nonzero of a row the second contribution is
	 * skipped when roff==coff and j==i, so that such an entry is not added twice.
	 * Rows with no nonzeroes leave out untouched.
	 * Column indices are widened to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double *xsh=rhs+1*(roff-coff);	/* rhs as indexed by stored row */
	double *ysh=out+1*(coff-roff);		/* out as indexed by stored column */
	const int same_off=(roff==coff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		double acc=((double)(0));
		const double xr=xsh[row];
		const rsb_nnz_idx_t first=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;

		/* First nonzero: the only leading position checked against the diagonal. */
		col=bindx[first];
		acc += VA[first]*rhs[1*col*1];
		if(!(same_off && col==row))
			ysh[(1)*(col)*1]+=VA[first]*xr;

		/* Interior nonzeroes, four per pass; the last nonzero of the row is left out. */
		for(nz=first+1;nz+3<row_end-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const double x0=rhs[c0];
			const double v0=VA[nz+0];
			const double p0=v0*xr;
			const rsb_coo_idx_t c1=bindx[nz+1];
			const double x1=rhs[c1];
			const double v1=VA[nz+1];
			const double p1=v1*xr;
			const rsb_coo_idx_t c2=bindx[nz+2];
			const double x2=rhs[c2];
			const double v2=VA[nz+2];
			const double p2=v2*xr;
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double x3=rhs[c3];
			const double v3=VA[nz+3];
			const double p3=v3*xr;

			acc += v0*x0;
			ysh[c0]+=p0;
			acc += v1*x1;
			ysh[c1]+=p1;
			acc += v2*x2;
			ysh[c2]+=p2;
			acc += v3*x3;
			ysh[c3]+=p3;
		}

		/* Remaining interior nonzeroes, one at a time. */
		for(;nz<row_end-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double p0=v0*xr;

			acc += v0*x0;
			ysh[c0]+=p0;
		}

		/* Last nonzero (if distinct from the first): checked against the diagonal again. */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(same_off && col==row))
				ysh[(1)*(col)*1]+=VA[nz]*xr;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Thin forwarder: every argument is passed on unchanged to the untransposed
	 * symmetric kernel rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sS_dE_uG.
	 * \return whatever the untransposed symmetric kernel returns.
	 */

	/* Symmetric transposed reverts to symmetric not transposed: with A == A^T both products are the same. */
	return rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Thin forwarder: every argument is passed on unchanged to the untransposed
	 * symmetric kernel rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sS_dE_uG.
	 * \return whatever the untransposed symmetric kernel returns.
	 */

	/* Symmetric transposed reverts to symmetric not transposed: with A == A^T both products are the same. */
	return rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Thin forwarder: every argument is passed on unchanged to the untransposed
	 * symmetric kernel rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sS_dE_uG.
	 * \return whatever the untransposed symmetric kernel returns.
	 */

	/* Symmetric transposed reverts to symmetric not transposed: the values are real (double), so A^H is A^T, which equals A here. */
	return rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Thin forwarder: every argument is passed on unchanged to the untransposed
	 * symmetric kernel rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sS_dE_uG.
	 * \return whatever the untransposed symmetric kernel returns.
	 */

	/* Symmetric transposed reverts to symmetric not transposed: the values are real (double), so A^H is A^T, which equals A here. */
	return rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The values are real (double), so no conjugation is applied to them here.
	 * Each stored nonzero (i,j) contributes twice: VA*rhs[j] goes into out[i],
	 * and VA*x_i goes into the out entry for column j, where x_i is read through
	 * rhs shifted by roff-coff and the write goes through out shifted by coff-roff.
	 * For the first and the last nonzero of a row the second contribution is
	 * skipped when roff==coff and j==i, so that such an entry is not added twice.
	 * Rows with no nonzeroes leave out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double *xsh=rhs+1*(roff-coff);	/* rhs as indexed by stored row */
	double *ysh=out+1*(coff-roff);		/* out as indexed by stored column */
	const int same_off=(roff==coff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		double acc=((double)(0));
		const double xr=xsh[row];
		const rsb_nnz_idx_t first=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;

		/* First nonzero: the only leading position checked against the diagonal. */
		col=bindx[first];
		acc += VA[first]*rhs[1*col*1];
		if(!(same_off && col==row))
			ysh[(1)*(col)*1]+=VA[first]*xr;

		/* Interior nonzeroes, four per pass; the last nonzero of the row is left out. */
		for(nz=first+1;nz+3<row_end-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const double x0=rhs[c0];
			const double v0=VA[nz+0];
			const double p0=v0*xr;
			const rsb_coo_idx_t c1=bindx[nz+1];
			const double x1=rhs[c1];
			const double v1=VA[nz+1];
			const double p1=v1*xr;
			const rsb_coo_idx_t c2=bindx[nz+2];
			const double x2=rhs[c2];
			const double v2=VA[nz+2];
			const double p2=v2*xr;
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double x3=rhs[c3];
			const double v3=VA[nz+3];
			const double p3=v3*xr;

			acc += v0*x0;
			ysh[c0]+=p0;
			acc += v1*x1;
			ysh[c1]+=p1;
			acc += v2*x2;
			ysh[c2]+=p2;
			acc += v3*x3;
			ysh[c3]+=p3;
		}

		/* Remaining interior nonzeroes, one at a time. */
		for(;nz<row_end-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double p0=v0*xr;

			acc += v0*x0;
			ysh[c0]+=p0;
		}

		/* Last nonzero (if distinct from the first): checked against the diagonal again. */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(same_off && col==row))
				ysh[(1)*(col)*1]+=VA[nz]*xr;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The values are real (double), so no conjugation is applied to them here.
	 * Each stored nonzero (i,j) contributes twice: VA*rhs[j] goes into out[i],
	 * and VA*x_i goes into the out entry for column j, where x_i is read through
	 * rhs shifted by roff-coff and the write goes through out shifted by coff-roff.
	 * For the first and the last nonzero of a row the second contribution is
	 * skipped when roff==coff and j==i, so that such an entry is not added twice.
	 * Rows with no nonzeroes leave out untouched.
	 * Column indices are widened to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double *xsh=rhs+1*(roff-coff);	/* rhs as indexed by stored row */
	double *ysh=out+1*(coff-roff);		/* out as indexed by stored column */
	const int same_off=(roff==coff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		double acc=((double)(0));
		const double xr=xsh[row];
		const rsb_nnz_idx_t first=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;

		/* First nonzero: the only leading position checked against the diagonal. */
		col=bindx[first];
		acc += VA[first]*rhs[1*col*1];
		if(!(same_off && col==row))
			ysh[(1)*(col)*1]+=VA[first]*xr;

		/* Interior nonzeroes, four per pass; the last nonzero of the row is left out. */
		for(nz=first+1;nz+3<row_end-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const double x0=rhs[c0];
			const double v0=VA[nz+0];
			const double p0=v0*xr;
			const rsb_coo_idx_t c1=bindx[nz+1];
			const double x1=rhs[c1];
			const double v1=VA[nz+1];
			const double p1=v1*xr;
			const rsb_coo_idx_t c2=bindx[nz+2];
			const double x2=rhs[c2];
			const double v2=VA[nz+2];
			const double p2=v2*xr;
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double x3=rhs[c3];
			const double v3=VA[nz+3];
			const double p3=v3*xr;

			acc += v0*x0;
			ysh[c0]+=p0;
			acc += v1*x1;
			ysh[c1]+=p1;
			acc += v2*x2;
			ysh[c2]+=p2;
			acc += v3*x3;
			ysh[c3]+=p3;
		}

		/* Remaining interior nonzeroes, one at a time. */
		for(;nz<row_end-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double p0=v0*xr;

			acc += v0*x0;
			ysh[c0]+=p0;
		}

		/* Last nonzero (if distinct from the first): checked against the diagonal again. */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(same_off && col==row))
				ysh[(1)*(col)*1]+=VA[nz]*xr;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Thin forwarder: every argument is passed on unchanged to the untransposed
	 * hermitian kernel rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sH_dE_uG.
	 * \return whatever the untransposed hermitian kernel returns.
	 */

	/* Symmetric transposed reverts to symmetric not transposed: the values are real (double), so A == A^H also means A == A^T. */
	return rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Thin forwarder: every argument is passed on unchanged to the untransposed
	 * hermitian kernel rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sH_dE_uG.
	 * \return whatever the untransposed hermitian kernel returns.
	 */

	/* Symmetric transposed reverts to symmetric not transposed: the values are real (double), so A == A^H also means A == A^T. */
	return rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Thin forwarder: every argument is passed on unchanged to the untransposed
	 * hermitian kernel rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sH_dE_uG.
	 * \return whatever the untransposed hermitian kernel returns.
	 */

	/* Symmetric transposed reverts to symmetric not transposed: with A == A^H the product by A^H is the product by A. */
	return rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Thin forwarder: every argument is passed on unchanged to the untransposed
	 * hermitian kernel rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sH_dE_uG.
	 * \return whatever the untransposed hermitian kernel returns.
	 */

	/* Symmetric transposed reverts to symmetric not transposed: with A == A^H the product by A^H is the product by A. */
	return rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For every row i in [br,bc) the nonzeroes stored at positions
	 * bpntr[i] .. bpntr[i+1]-1 of VA are multiplied by the rhs entries
	 * selected through bindx, and the row total is added to out[i].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here (only stored nonzeroes are used): look at caller level. */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		double acc=((double)(0));

		/* Unrolled part: four nonzeroes per pass, accumulated in storage order. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const rsb_coo_idx_t c1=bindx[nz+1];
			const rsb_coo_idx_t c2=bindx[nz+2];
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double x0=rhs[c0];
			const double x1=rhs[c1];
			const double x2=rhs[c2];
			const double x3=rhs[c3];
			const double v0=VA[nz+0];
			const double v1=VA[nz+1];
			const double v2=VA[nz+2];
			const double v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* Leftover (at most three) nonzeroes of the row. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];

			acc+=v0*x0;
			++nz;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For every row i in [br,bc) the nonzeroes stored at positions
	 * bpntr[i] .. bpntr[i+1]-1 of VA are multiplied by the rhs entries
	 * selected through bindx (each index is widened to rsb_coo_idx_t before use),
	 * and the row total is added to out[i].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here (only stored nonzeroes are used): look at caller level. */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* only rows in [br,bc) are visited (bounded box) */
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		double acc=((double)(0));

		/* Unrolled part: four nonzeroes per pass, accumulated in storage order. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0];
			const rsb_coo_idx_t c1=bindx[nz+1];
			const rsb_coo_idx_t c2=bindx[nz+2];
			const rsb_coo_idx_t c3=bindx[nz+3];
			const double x0=rhs[c0];
			const double x1=rhs[c1];
			const double x2=rhs[c2];
			const double x3=rhs[c3];
			const double v0=VA[nz+0];
			const double v1=VA[nz+1];
			const double v2=VA[nz+2];
			const double v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* Leftover (at most three) nonzeroes of the row. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];

			acc+=v0*x0;
			++nz;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * Rows br (included) to bc (excluded) are visited; each stored entry scatters its product into the output at its column index.
	 * The input is read through rhs shifted by (roff-coff), the output written through out shifted by (coff-roff).
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xr=shifted_rhs[row];	/* the one input entry shared by the whole row */
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* four nonzeroes per pass: all products first, then the scattered updates in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			shifted_out[c0]+=p0;
			shifted_out[c1]+=p1;
			shifted_out[c2]+=p2;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			shifted_out[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * Rows br (included) to bc (excluded) are visited; each stored entry scatters its product into the output at its column index.
	 * The input is read through rhs shifted by (roff-coff), the output written through out shifted by (coff-roff).
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xr=shifted_rhs[row];	/* the one input entry shared by the whole row */
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* four nonzeroes per pass: all products first, then the scattered updates in index order */
		while(nz+3<row_end)
		{
			/* half-width column indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			shifted_out[c0]+=p0;
			shifted_out[c1]+=p1;
			shifted_out[c2]+=p2;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			shifted_out[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * Values are of real type double and no conjugation step is performed on them here.
	 * Rows br (included) to bc (excluded) are visited; each stored entry scatters its product into the output at its column index.
	 * The input is read through rhs shifted by (roff-coff), the output written through out shifted by (coff-roff).
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xr=shifted_rhs[row];	/* the one input entry shared by the whole row */
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* four nonzeroes per pass: all products first, then the scattered updates in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			shifted_out[c0]+=p0;
			shifted_out[c1]+=p1;
			shifted_out[c2]+=p2;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			shifted_out[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * Values are of real type double and no conjugation step is performed on them here.
	 * Rows br (included) to bc (excluded) are visited; each stored entry scatters its product into the output at its column index.
	 * The input is read through rhs shifted by (roff-coff), the output written through out shifted by (coff-roff).
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xr=shifted_rhs[row];	/* the one input entry shared by the whole row */
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* four nonzeroes per pass: all products first, then the scattered updates in index order */
		while(nz+3<row_end)
		{
			/* half-width column indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			shifted_out[c0]+=p0;
			shifted_out[c1]+=p1;
			shifted_out[c2]+=p2;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			shifted_out[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * Every stored entry contributes twice: to the output entry of its row, and (mirrored) to the
	 * output entry of its column, the latter through out shifted by (coff-roff) and rhs shifted by (roff-coff).
	 * On the first and on the last entry of a row the mirrored update is skipped when roff==coff and column==row.
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		double dot=((double)(0));
		const double xr=shifted_rhs[row];	/* input entry used by all mirrored updates of this row */
		const rsb_nnz_idx_t row_begin=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* empty row: output left untouched */

		/* first entry of the row, with the diagonal check */
		col=bindx[row_begin];
		dot+=VA[row_begin]*rhs[col];
		if(!(roff==coff && (col==row)))
			shifted_out[col]+=VA[row_begin]*xr;

		/* interior entries (all but first and last), four per pass, no diagonal check */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const double v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double p0=(v0)*xr;
			const double p1=(v1)*xr;
			const double p2=(v2)*xr;
			const double p3=(v3)*xr;

			dot+=(v0)*x0;
			shifted_out[c0]+=p0;
			dot+=(v1)*x1;
			shifted_out[c1]+=p1;
			dot+=(v2)*x2;
			shifted_out[c2]+=p2;
			dot+=(v3)*x3;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double p0=(v0)*xr;

			dot+=(v0)*x0;
			shifted_out[c0]+=p0;
			++nz;
		}

		/* last entry of the row (absent when the row holds a single entry), with the diagonal check */
		if(nz<row_end)
		{
			col=bindx[nz];
			dot+=VA[nz]*rhs[col];
			if(!(roff==coff && (col==row)))
				shifted_out[col]+=VA[nz]*xr;
		}

		/* row contribution goes in after all mirrored updates of this row */
		out[row]+=dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * Every stored entry contributes twice: to the output entry of its row, and (mirrored) to the
	 * output entry of its column, the latter through out shifted by (coff-roff) and rhs shifted by (roff-coff).
	 * On the first and on the last entry of a row the mirrored update is skipped when roff==coff and column==row.
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		double dot=((double)(0));
		const double xr=shifted_rhs[row];	/* input entry used by all mirrored updates of this row */
		const rsb_nnz_idx_t row_begin=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* empty row: output left untouched */

		/* first entry of the row, with the diagonal check */
		col=bindx[row_begin];
		dot+=VA[row_begin]*rhs[col];
		if(!(roff==coff && (col==row)))
			shifted_out[col]+=VA[row_begin]*xr;

		/* interior entries (all but first and last), four per pass, no diagonal check */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			/* half-width column indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const double v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double p0=(v0)*xr;
			const double p1=(v1)*xr;
			const double p2=(v2)*xr;
			const double p3=(v3)*xr;

			dot+=(v0)*x0;
			shifted_out[c0]+=p0;
			dot+=(v1)*x1;
			shifted_out[c1]+=p1;
			dot+=(v2)*x2;
			shifted_out[c2]+=p2;
			dot+=(v3)*x3;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double p0=(v0)*xr;

			dot+=(v0)*x0;
			shifted_out[c0]+=p0;
			++nz;
		}

		/* last entry of the row (absent when the row holds a single entry), with the diagonal check */
		if(nz<row_end)
		{
			col=bindx[nz];
			dot+=VA[nz]*rhs[col];
			if(!(roff==coff && (col==row)))
				shifted_out[col]+=VA[nz]*xr;
		}

		/* row contribution goes in after all mirrored updates of this row */
		out[row]+=dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * All arguments are handed over unchanged to the non transposed symmetric kernel, whose error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval=rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * All arguments are handed over unchanged to the non transposed symmetric kernel, whose error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval=rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * All arguments are handed over unchanged to the non transposed symmetric kernel, whose error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval=rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * All arguments are handed over unchanged to the non transposed symmetric kernel, whose error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval=rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * Values are of real type double and no conjugation step is performed on them here.
	 * Every stored entry contributes twice: to the output entry of its row, and (mirrored) to the
	 * output entry of its column, the latter through out shifted by (coff-roff) and rhs shifted by (roff-coff).
	 * On the first and on the last entry of a row the mirrored update is skipped when roff==coff and column==row.
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		double dot=((double)(0));
		const double xr=shifted_rhs[row];	/* input entry used by all mirrored updates of this row */
		const rsb_nnz_idx_t row_begin=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* empty row: output left untouched */

		/* first entry of the row, with the diagonal check */
		col=bindx[row_begin];
		dot+=VA[row_begin]*rhs[col];
		if(!(roff==coff && (col==row)))
			shifted_out[col]+=VA[row_begin]*xr;

		/* interior entries (all but first and last), four per pass, no diagonal check */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const double v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double p0=(v0)*xr;
			const double p1=(v1)*xr;
			const double p2=(v2)*xr;
			const double p3=(v3)*xr;

			dot+=(v0)*x0;
			shifted_out[c0]+=p0;
			dot+=(v1)*x1;
			shifted_out[c1]+=p1;
			dot+=(v2)*x2;
			shifted_out[c2]+=p2;
			dot+=(v3)*x3;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double p0=(v0)*xr;

			dot+=(v0)*x0;
			shifted_out[c0]+=p0;
			++nz;
		}

		/* last entry of the row (absent when the row holds a single entry), with the diagonal check */
		if(nz<row_end)
		{
			col=bindx[nz];
			dot+=VA[nz]*rhs[col];
			if(!(roff==coff && (col==row)))
				shifted_out[col]+=VA[nz]*xr;
		}

		/* row contribution goes in after all mirrored updates of this row */
		out[row]+=dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * Values are of real type double and no conjugation step is performed on them here.
	 * Every stored entry contributes twice: to the output entry of its row, and (mirrored) to the
	 * output entry of its column, the latter through out shifted by (coff-roff) and rhs shifted by (roff-coff).
	 * On the first and on the last entry of a row the mirrored update is skipped when roff==coff and column==row.
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		double dot=((double)(0));
		const double xr=shifted_rhs[row];	/* input entry used by all mirrored updates of this row */
		const rsb_nnz_idx_t row_begin=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* empty row: output left untouched */

		/* first entry of the row, with the diagonal check */
		col=bindx[row_begin];
		dot+=VA[row_begin]*rhs[col];
		if(!(roff==coff && (col==row)))
			shifted_out[col]+=VA[row_begin]*xr;

		/* interior entries (all but first and last), four per pass, no diagonal check */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			/* half-width column indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const double v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double p0=(v0)*xr;
			const double p1=(v1)*xr;
			const double p2=(v2)*xr;
			const double p3=(v3)*xr;

			dot+=(v0)*x0;
			shifted_out[c0]+=p0;
			dot+=(v1)*x1;
			shifted_out[c1]+=p1;
			dot+=(v2)*x2;
			shifted_out[c2]+=p2;
			dot+=(v3)*x3;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double p0=(v0)*xr;

			dot+=(v0)*x0;
			shifted_out[c0]+=p0;
			++nz;
		}

		/* last entry of the row (absent when the row holds a single entry), with the diagonal check */
		if(nz<row_end)
		{
			col=bindx[nz];
			dot+=VA[nz]*rhs[col];
			if(!(roff==coff && (col==row)))
				shifted_out[col]+=VA[nz]*xr;
		}

		/* row contribution goes in after all mirrored updates of this row */
		out[row]+=dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * All arguments are handed over unchanged to the non transposed hermitian kernel, whose error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval=rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * All arguments are handed over unchanged to the non transposed hermitian kernel, whose error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval=rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_C__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * All arguments are handed over unchanged to the non transposed hermitian kernel, whose error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval=rsb__BCSR_spmv_uaua_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_double_H__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * All arguments are handed over unchanged to the non transposed hermitian kernel, whose error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval=rsb__BCSR_spmv_uaua_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 * All rows from 0 to Mdim-1 are visited; each output entry is zeroed before its row result is added to it.
	 * Row i spans entries bpntr[i] to bpntr[i+1]-1 of VA and bindx.
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	rsb_coo_idx_t row;

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		double dot=((double)(0));

		/* overwrite semantics: discard the previous output value first */
		out[row]=0;

		/* four nonzeroes per pass: fetch all operands, then accumulate in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];

			dot+=v0*x0;
			dot+=v1*x1;
			dot+=v2*x2;
			dot+=v3*x3;
			nz+=4;
		}
		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];

			dot+=v0*x0;
			++nz;
		}

		out[row]+=dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 * All rows from 0 to Mdim-1 are visited; each output entry is zeroed before its row result is added to it.
	 * Row i spans entries bpntr[i] to bpntr[i+1]-1 of VA and bindx.
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	rsb_coo_idx_t row;

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		double dot=((double)(0));

		/* overwrite semantics: discard the previous output value first */
		out[row]=0;

		/* four nonzeroes per pass: fetch all operands, then accumulate in index order */
		while(nz+3<row_end)
		{
			/* half-width column indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];

			dot+=v0*x0;
			dot+=v1*x1;
			dot+=v2*x2;
			dot+=v3*x3;
			nz+=4;
		}
		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];

			dot+=v0*x0;
			++nz;
		}

		out[row]+=dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 * The first mdim entries of out are passed to rsb__cblas_Xscal before any accumulation takes place.
	 * All rows from 0 to Mdim-1 are then visited; each stored entry scatters its product into the output at its column index.
	 * The input is read through rhs shifted by (roff-coff), the output written through out shifted by (coff-roff).
	 * This kernel always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double *const shifted_rhs=rhs+(roff-coff);
	double *const shifted_out=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably a NULL scaling factor means "scale by zero", i.e. clear out -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xr=shifted_rhs[row];	/* the one input entry shared by the whole row */
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* four nonzeroes per pass: all products first, then the scattered updates in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double p0=VA[nz+0]*xr;
			const double p1=VA[nz+1]*xr;
			const double p2=VA[nz+2]*xr;
			const double p3=VA[nz+3]*xr;

			shifted_out[c0]+=p0;
			shifted_out[c1]+=p1;
			shifted_out[c2]+=p2;
			shifted_out[c3]+=p3;
			nz+=4;
		}
		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double p0=VA[nz]*xr;

			shifted_out[c0]+=p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector multiply, \f$y \leftarrow {A^T} \cdot x\f$, for a general (unsymmetric) A.
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_half_idx_t column indices.
	 * Row i spans the nonzeroes bpntr[i] .. bpntr[i+1]-1; each of them adds VA[k] times the i-th entry
	 * of the shifted rhs to the entry of the shifted out selected by bindx[k].
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_shifted = rhs+1*(roff-coff);
	double *y_shifted = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xv = x_shifted[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all loads first, then the updates in index order */
		while(nz+3<row_end)
		{
			/* half-width indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double upd0 = VA[nz  ]*xv;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double upd1 = VA[nz+1]*xv;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double upd2 = VA[nz+2]*xv;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double upd3 = VA[nz+3]*xv;

			y_shifted[col0] += upd0;
			y_shifted[col1] += upd1;
			y_shifted[col2] += upd2;
			y_shifted[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double upd = VA[nz]*xv;

			y_shifted[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector multiply, \f$y \leftarrow {A^H} \cdot x\f$, for a general (unsymmetric) A.
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_coo_idx_t column indices.
	 * Values are real doubles and no conjugation is applied, so the arithmetic is that of the plain transposed product.
	 * Row i spans the nonzeroes bpntr[i] .. bpntr[i+1]-1; each of them adds VA[k] times the i-th entry
	 * of the shifted rhs to the entry of the shifted out selected by bindx[k].
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_shifted = rhs+1*(roff-coff);
	double *y_shifted = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xv = x_shifted[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all loads first, then the updates in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double upd0 = VA[nz  ]*xv;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double upd1 = VA[nz+1]*xv;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double upd2 = VA[nz+2]*xv;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double upd3 = VA[nz+3]*xv;

			y_shifted[col0] += upd0;
			y_shifted[col1] += upd1;
			y_shifted[col2] += upd2;
			y_shifted[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double upd = VA[nz]*xv;

			y_shifted[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector multiply, \f$y \leftarrow {A^H} \cdot x\f$, for a general (unsymmetric) A.
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_half_idx_t column indices.
	 * Values are real doubles and no conjugation is applied, so the arithmetic is that of the plain transposed product.
	 * Row i spans the nonzeroes bpntr[i] .. bpntr[i+1]-1; each of them adds VA[k] times the i-th entry
	 * of the shifted rhs to the entry of the shifted out selected by bindx[k].
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_shifted = rhs+1*(roff-coff);
	double *y_shifted = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xv = x_shifted[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all loads first, then the updates in index order */
		while(nz+3<row_end)
		{
			/* half-width indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double upd0 = VA[nz  ]*xv;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double upd1 = VA[nz+1]*xv;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double upd2 = VA[nz+2]*xv;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double upd3 = VA[nz+3]*xv;

			y_shifted[col0] += upd0;
			y_shifted[col1] += upd1;
			y_shifted[col2] += upd2;
			y_shifted[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double upd = VA[nz]*xv;

			y_shifted[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector multiply, \f$y \leftarrow {A} \cdot x\f$, for a symmetric A (\f$A = A^T\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_coo_idx_t column indices.
	 * Each stored nonzero of row i contributes VA[k]*rhs[bindx[k]] to the dot product of row i and,
	 * mirrored, VA[k] times the i-th entry of the shifted rhs to the shifted out entry at bindx[k].
	 * The mirrored update is skipped for the first and the last nonzero of a row when roff==coff
	 * and the column index equals the row index, so that such an element is counted once only.
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_rows = rhs+1*(roff-coff);
	double *y_cols = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over Mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,Mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xr = x_rows[row];
		const rsb_nnz_idx_t first = bpntr[row], past = bpntr[row+1];
		rsb_nnz_idx_t nz = first;
		rsb_coo_idx_t col;
		double dot = ((double)(0));

		/* nothing stored for this row */
		if(nz==past)
			continue;

		/* leading nonzero: the mirrored update is conditional, see above */
		col = bindx[nz];
		dot += VA[nz]*rhs[1*col*1];
		if(roff!=coff || (col!=row))
			y_cols[(1)*(col)*1] += VA[nz]*xr;
		nz = first+1;

		/* interior nonzeroes, four per pass: loads first, then interleaved accumulate/scatter */
		while(nz+3<past-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double x0 = rhs[col0];
			const double v0 = VA[nz  ];
			const double m0 = ( v0 )*xr;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz+1];
			const double m1 = ( v1 )*xr;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz+2];
			const double m2 = ( v2 )*xr;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz+3];
			const double m3 = ( v3 )*xr;

			dot += (v0)*x0;
			y_cols[col0] += m0;
			dot += (v1)*x1;
			y_cols[col1] += m1;
			dot += (v2)*x2;
			y_cols[col2] += m2;
			dot += (v3)*x3;
			y_cols[col3] += m3;
			nz += 4;
		}

		/* interior leftovers, one at a time, stopping before the last nonzero */
		while(nz<past-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double m0 = ( v0 )*xr;

			dot += (v0)*x0;
			y_cols[col0] += m0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single element) */
		if(nz<past)
		{
			col = bindx[nz];
			dot += VA[nz]*rhs[1*col*1];
			if(roff!=coff || (col!=row))
				y_cols[(1)*(col)*1] += VA[nz]*xr;
		}

		out[(1*row*1)] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector multiply, \f$y \leftarrow {A} \cdot x\f$, for a symmetric A (\f$A = A^T\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_half_idx_t column indices.
	 * Each stored nonzero of row i contributes VA[k]*rhs[bindx[k]] to the dot product of row i and,
	 * mirrored, VA[k] times the i-th entry of the shifted rhs to the shifted out entry at bindx[k].
	 * The mirrored update is skipped for the first and the last nonzero of a row when roff==coff
	 * and the column index equals the row index, so that such an element is counted once only.
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_rows = rhs+1*(roff-coff);
	double *y_cols = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over Mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,Mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xr = x_rows[row];
		const rsb_nnz_idx_t first = bpntr[row], past = bpntr[row+1];
		rsb_nnz_idx_t nz = first;
		rsb_coo_idx_t col;
		double dot = ((double)(0));

		/* nothing stored for this row */
		if(nz==past)
			continue;

		/* leading nonzero: the mirrored update is conditional, see above */
		col = bindx[nz];
		dot += VA[nz]*rhs[1*col*1];
		if(roff!=coff || (col!=row))
			y_cols[(1)*(col)*1] += VA[nz]*xr;
		nz = first+1;

		/* interior nonzeroes, four per pass: loads first, then interleaved accumulate/scatter */
		while(nz+3<past-1)
		{
			/* half-width indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double x0 = rhs[col0];
			const double v0 = VA[nz  ];
			const double m0 = ( v0 )*xr;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz+1];
			const double m1 = ( v1 )*xr;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz+2];
			const double m2 = ( v2 )*xr;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz+3];
			const double m3 = ( v3 )*xr;

			dot += (v0)*x0;
			y_cols[col0] += m0;
			dot += (v1)*x1;
			y_cols[col1] += m1;
			dot += (v2)*x2;
			y_cols[col2] += m2;
			dot += (v3)*x3;
			y_cols[col3] += m3;
			nz += 4;
		}

		/* interior leftovers, one at a time, stopping before the last nonzero */
		while(nz<past-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double m0 = ( v0 )*xr;

			dot += (v0)*x0;
			y_cols[col0] += m0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single element) */
		if(nz<past)
		{
			col = bindx[nz];
			dot += VA[nz]*rhs[1*col*1];
			if(roff!=coff || (col!=row))
				y_cols[(1)*(col)*1] += VA[nz]*xr;
		}

		out[(1*row*1)] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector multiply, \f$y \leftarrow {A^T} \cdot x\f$, for a symmetric A (\f$A = A^T\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_coo_idx_t column indices.
	 * All arguments are handed over unchanged to the non-transposed symmetric kernel.
	 * \return the error value of the non-transposed symmetric kernel.
	 */

	/* with A equal to its own transpose, the transposed product is the plain product: delegate */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector multiply, \f$y \leftarrow {A^T} \cdot x\f$, for a symmetric A (\f$A = A^T\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_half_idx_t column indices.
	 * All arguments are handed over unchanged to the non-transposed symmetric kernel.
	 * \return the error value of the non-transposed symmetric kernel.
	 */

	/* with A equal to its own transpose, the transposed product is the plain product: delegate */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector multiply, \f$y \leftarrow {A^H} \cdot x\f$, for a symmetric A (\f$A = A^T\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_coo_idx_t column indices.
	 * All arguments are handed over unchanged to the non-transposed symmetric kernel.
	 * \return the error value of the non-transposed symmetric kernel.
	 */

	/* real values and A equal to its own transpose: the conjugate-transposed product is the plain product, so delegate */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector multiply, \f$y \leftarrow {A^H} \cdot x\f$, for a symmetric A (\f$A = A^T\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_half_idx_t column indices.
	 * All arguments are handed over unchanged to the non-transposed symmetric kernel.
	 * \return the error value of the non-transposed symmetric kernel.
	 */

	/* real values and A equal to its own transpose: the conjugate-transposed product is the plain product, so delegate */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector multiply, \f$y \leftarrow {A} \cdot x\f$, for a hermitian A (\f$A = A^H\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_coo_idx_t column indices.
	 * Values are real doubles and no conjugation is applied when mirroring.
	 * Each stored nonzero of row i contributes VA[k]*rhs[bindx[k]] to the dot product of row i and,
	 * mirrored, VA[k] times the i-th entry of the shifted rhs to the shifted out entry at bindx[k].
	 * The mirrored update is skipped for the first and the last nonzero of a row when roff==coff
	 * and the column index equals the row index, so that such an element is counted once only.
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_rows = rhs+1*(roff-coff);
	double *y_cols = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over Mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,Mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xr = x_rows[row];
		const rsb_nnz_idx_t first = bpntr[row], past = bpntr[row+1];
		rsb_nnz_idx_t nz = first;
		rsb_coo_idx_t col;
		double dot = ((double)(0));

		/* nothing stored for this row */
		if(nz==past)
			continue;

		/* leading nonzero: the mirrored update is conditional, see above */
		col = bindx[nz];
		dot += VA[nz]*rhs[1*col*1];
		if(roff!=coff || (col!=row))
			y_cols[(1)*(col)*1] += VA[nz]*xr;
		nz = first+1;

		/* interior nonzeroes, four per pass: loads first, then interleaved accumulate/scatter */
		while(nz+3<past-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double x0 = rhs[col0];
			const double v0 = VA[nz  ];
			const double m0 = ( v0 )*xr;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz+1];
			const double m1 = ( v1 )*xr;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz+2];
			const double m2 = ( v2 )*xr;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz+3];
			const double m3 = ( v3 )*xr;

			dot += (v0)*x0;
			y_cols[col0] += m0;
			dot += (v1)*x1;
			y_cols[col1] += m1;
			dot += (v2)*x2;
			y_cols[col2] += m2;
			dot += (v3)*x3;
			y_cols[col3] += m3;
			nz += 4;
		}

		/* interior leftovers, one at a time, stopping before the last nonzero */
		while(nz<past-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double m0 = ( v0 )*xr;

			dot += (v0)*x0;
			y_cols[col0] += m0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single element) */
		if(nz<past)
		{
			col = bindx[nz];
			dot += VA[nz]*rhs[1*col*1];
			if(roff!=coff || (col!=row))
				y_cols[(1)*(col)*1] += VA[nz]*xr;
		}

		out[(1*row*1)] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector multiply, \f$y \leftarrow {A} \cdot x\f$, for a hermitian A (\f$A = A^H\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_half_idx_t column indices.
	 * Values are real doubles and no conjugation is applied when mirroring.
	 * Each stored nonzero of row i contributes VA[k]*rhs[bindx[k]] to the dot product of row i and,
	 * mirrored, VA[k] times the i-th entry of the shifted rhs to the shifted out entry at bindx[k].
	 * The mirrored update is skipped for the first and the last nonzero of a row when roff==coff
	 * and the column index equals the row index, so that such an element is counted once only.
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_rows = rhs+1*(roff-coff);
	double *y_cols = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over Mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,Mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xr = x_rows[row];
		const rsb_nnz_idx_t first = bpntr[row], past = bpntr[row+1];
		rsb_nnz_idx_t nz = first;
		rsb_coo_idx_t col;
		double dot = ((double)(0));

		/* nothing stored for this row */
		if(nz==past)
			continue;

		/* leading nonzero: the mirrored update is conditional, see above */
		col = bindx[nz];
		dot += VA[nz]*rhs[1*col*1];
		if(roff!=coff || (col!=row))
			y_cols[(1)*(col)*1] += VA[nz]*xr;
		nz = first+1;

		/* interior nonzeroes, four per pass: loads first, then interleaved accumulate/scatter */
		while(nz+3<past-1)
		{
			/* half-width indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double x0 = rhs[col0];
			const double v0 = VA[nz  ];
			const double m0 = ( v0 )*xr;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz+1];
			const double m1 = ( v1 )*xr;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz+2];
			const double m2 = ( v2 )*xr;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz+3];
			const double m3 = ( v3 )*xr;

			dot += (v0)*x0;
			y_cols[col0] += m0;
			dot += (v1)*x1;
			y_cols[col1] += m1;
			dot += (v2)*x2;
			y_cols[col2] += m2;
			dot += (v3)*x3;
			y_cols[col3] += m3;
			nz += 4;
		}

		/* interior leftovers, one at a time, stopping before the last nonzero */
		while(nz<past-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double m0 = ( v0 )*xr;

			dot += (v0)*x0;
			y_cols[col0] += m0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single element) */
		if(nz<past)
		{
			col = bindx[nz];
			dot += VA[nz]*rhs[1*col*1];
			if(roff!=coff || (col!=row))
				y_cols[(1)*(col)*1] += VA[nz]*xr;
		}

		out[(1*row*1)] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector multiply, \f$y \leftarrow {A^T} \cdot x\f$, for a hermitian A (\f$A = A^H\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_coo_idx_t column indices.
	 * All arguments are handed over unchanged to the non-transposed hermitian kernel.
	 * \return the error value of the non-transposed hermitian kernel.
	 */

	/* real values: hermitian means symmetric, so the transposed product is the plain product; delegate */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector multiply, \f$y \leftarrow {A^T} \cdot x\f$, for a hermitian A (\f$A = A^H\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_half_idx_t column indices.
	 * All arguments are handed over unchanged to the non-transposed hermitian kernel.
	 * \return the error value of the non-transposed hermitian kernel.
	 */

	/* real values: hermitian means symmetric, so the transposed product is the plain product; delegate */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector multiply, \f$y \leftarrow {A^H} \cdot x\f$, for a hermitian A (\f$A = A^H\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_coo_idx_t column indices.
	 * All arguments are handed over unchanged to the non-transposed hermitian kernel.
	 * \return the error value of the non-transposed hermitian kernel.
	 */

	/* A equals its own conjugate transpose, so this product is the plain product: delegate */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector multiply, \f$y \leftarrow {A^H} \cdot x\f$, for a hermitian A (\f$A = A^H\f$).
	 * A is expected in BCSR format with 1 x 1 blocks, explicit diagonal, double values and rsb_half_idx_t column indices.
	 * All arguments are handed over unchanged to the non-transposed hermitian kernel.
	 * \return the error value of the non-transposed hermitian kernel.
	 */

	/* A equals its own conjugate transpose, so this product is the plain product: delegate */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector multiply, \f$y \leftarrow {A} \cdot x\f$, for a general (unsymmetric) A.
	 * A is expected in BCSR format with 1 x 1 blocks, implicit diagonal, double values and rsb_coo_idx_t column indices.
	 * For each row i, out[i] is set to the sum of VA[k]*rhs[bindx[k]] over k in bpntr[i] .. bpntr[i+1]-1.
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* The implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	rsb_coo_idx_t row;

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		double dot = ((double)(0));

		/* the row result is cleared first, and the accumulated value is added once the row is done */
		out[1*(row*1)+0] = 0;

		/* main part: four nonzeroes per pass, loads first, then the accumulations in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double x0 = rhs[col0];
			const double v0 = VA[nz  ];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz+3];

			dot += v0*x0;
			dot += v1*x1;
			dot += v2*x2;
			dot += v3*x3;
			nz += 4;
		}

		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double xc = rhs[col];
			const double v = VA[nz];

			dot += v*xc;
			++nz;
		}

		out[(1*row*1)] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector multiply, \f$y \leftarrow {A} \cdot x\f$, for a general (unsymmetric) A.
	 * A is expected in BCSR format with 1 x 1 blocks, implicit diagonal, double values and rsb_half_idx_t column indices.
	 * For each row i, out[i] is set to the sum of VA[k]*rhs[bindx[k]] over k in bpntr[i] .. bpntr[i+1]-1.
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* The implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	rsb_coo_idx_t row;

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		double dot = ((double)(0));

		/* the row result is cleared first, and the accumulated value is added once the row is done */
		out[1*(row*1)+0] = 0;

		/* main part: four nonzeroes per pass, loads first, then the accumulations in index order */
		while(nz+3<row_end)
		{
			/* half-width indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double x0 = rhs[col0];
			const double v0 = VA[nz  ];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz+3];

			dot += v0*x0;
			dot += v1*x1;
			dot += v2*x2;
			dot += v3*x3;
			nz += 4;
		}

		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double xc = rhs[col];
			const double v = VA[nz];

			dot += v*xc;
			++nz;
		}

		out[(1*row*1)] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector multiply, \f$y \leftarrow {A^T} \cdot x\f$, for a general (unsymmetric) A.
	 * A is expected in BCSR format with 1 x 1 blocks, implicit diagonal, double values and rsb_coo_idx_t column indices.
	 * Row i spans the nonzeroes bpntr[i] .. bpntr[i+1]-1; each of them adds VA[k] times the i-th entry
	 * of the shifted rhs to the entry of the shifted out selected by bindx[k].
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* The implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_shifted = rhs+1*(roff-coff);
	double *y_shifted = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xv = x_shifted[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all loads first, then the updates in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double upd0 = VA[nz  ]*xv;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double upd1 = VA[nz+1]*xv;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double upd2 = VA[nz+2]*xv;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double upd3 = VA[nz+3]*xv;

			y_shifted[col0] += upd0;
			y_shifted[col1] += upd1;
			y_shifted[col2] += upd2;
			y_shifted[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double upd = VA[nz]*xv;

			y_shifted[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector multiply, \f$y \leftarrow {A^T} \cdot x\f$, for a general (unsymmetric) A.
	 * A is expected in BCSR format with 1 x 1 blocks, implicit diagonal, double values and rsb_half_idx_t column indices.
	 * Row i spans the nonzeroes bpntr[i] .. bpntr[i+1]-1; each of them adds VA[k] times the i-th entry
	 * of the shifted rhs to the entry of the shifted out selected by bindx[k].
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* The implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_shifted = rhs+1*(roff-coff);
	double *y_shifted = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xv = x_shifted[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all loads first, then the updates in index order */
		while(nz+3<row_end)
		{
			/* half-width indices are widened to rsb_coo_idx_t before use */
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double upd0 = VA[nz  ]*xv;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double upd1 = VA[nz+1]*xv;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double upd2 = VA[nz+2]*xv;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double upd3 = VA[nz+3]*xv;

			y_shifted[col0] += upd0;
			y_shifted[col1] += upd1;
			y_shifted[col2] += upd2;
			y_shifted[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double upd = VA[nz]*xv;

			y_shifted[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector multiply, \f$y \leftarrow {A^H} \cdot x\f$, for a general (unsymmetric) A.
	 * A is expected in BCSR format with 1 x 1 blocks, implicit diagonal, double values and rsb_coo_idx_t column indices.
	 * Values are real doubles and no conjugation is applied, so the arithmetic is that of the plain transposed product.
	 * Row i spans the nonzeroes bpntr[i] .. bpntr[i+1]-1; each of them adds VA[k] times the i-th entry
	 * of the shifted rhs to the entry of the shifted out selected by bindx[k].
	 * \return \rsb_errval_inp_param_msg (in practice always RSB_ERR_NO_ERROR: there is no failure path here)
	 */

	/* The implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	/* row-side view of rhs and column-side view of out, both shifted by the offsets difference */
	const double *x_shifted = rhs+1*(roff-coff);
	double *y_shifted = out+1*(coff-roff);
	rsb_coo_idx_t row;

	/* NULL scaling factor over mdim entries of out: presumably this clears them ("y <- ..." semantics) -- confirm against rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xv = x_shifted[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all loads first, then the updates in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const double upd0 = VA[nz  ]*xv;
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const double upd1 = VA[nz+1]*xv;
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const double upd2 = VA[nz+2]*xv;
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double upd3 = VA[nz+3]*xv;

			y_shifted[col0] += upd0;
			y_shifted[col1] += upd1;
			y_shifted[col2] += upd2;
			y_shifted[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row (fewer than four) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double upd = VA[nz]*xv;

			y_shifted[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^H} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * The values are plain doubles, so no conjugation step appears in the code.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); updated per column index */
	rsb_coo_idx_t row;

	/* out goes through rsb__cblas_Xscal() with a NULL factor first (presumably zeroing mdim entries -- TODO confirm) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const double xr = xbase[row];
		rsb_nnz_idx_t nz = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* main part: four nonzeroes per pass, products formed before the scatter */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double prod0 = VA[nz]*xr;
			const double prod1 = VA[nz+1]*xr;
			const double prod2 = VA[nz+2]*xr;
			const double prod3 = VA[nz+3]*xr;

			ybase[col0] += prod0;
			ybase[col1] += prod1;
			ybase[col2] += prod2;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* tail: the (at most three) nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double prod = VA[nz]*xr;

			ybase[col] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * Every stored nonzero is used twice: once for the row result and once for the mirrored (transposed) position.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); target of the mirrored updates */
	rsb_coo_idx_t row;

	/* out goes through rsb__cblas_Xscal() with a NULL factor first (presumably zeroing Mdim entries -- TODO confirm) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,Mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		double acc = 0.0;	/* row-wise partial result */
		const double xr = xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* no nonzeroes on this row: out[row] stays untouched */

		/* first nonzero: the mirrored update is skipped only for an on-diagonal entry (roff==coff and col==row) */
		col = bindx[row_begin];
		acc += VA[row_begin]*rhs[col];
		if(!(roff==coff && col==row))
			ybase[col] += VA[row_begin]*xr;

		/* interior nonzeroes (all but first and last), four per pass, no diagonal test */
		nz = row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const double a0 = VA[nz], a1 = VA[nz+1], a2 = VA[nz+2], a3 = VA[nz+3];
			const double prod0 = a0*xr;
			const double prod1 = a1*xr;
			const double prod2 = a2*xr;
			const double prod3 = a3*xr;

			acc += a0*x0;
			ybase[col0] += prod0;
			acc += a1*x1;
			ybase[col1] += prod1;
			acc += a2*x2;
			ybase[col2] += prod2;
			acc += a3*x3;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double a0 = VA[nz];
			const double prod0 = a0*xr;

			acc += a0*x0;
			ybase[col0] += prod0;
			++nz;
		}

		/* last nonzero (when the row has more than one): same diagonal test as the first */
		if(nz<row_end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ybase[col] += VA[nz]*xr;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * Every stored nonzero is used twice: once for the row result and once for the mirrored (transposed) position.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); target of the mirrored updates */
	rsb_coo_idx_t row;

	/* out goes through rsb__cblas_Xscal() with a NULL factor first (presumably zeroing Mdim entries -- TODO confirm) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,Mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		double acc = 0.0;	/* row-wise partial result */
		const double xr = xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* no nonzeroes on this row: out[row] stays untouched */

		/* first nonzero: the mirrored update is skipped only for an on-diagonal entry (roff==coff and col==row) */
		col = bindx[row_begin];
		acc += VA[row_begin]*rhs[col];
		if(!(roff==coff && col==row))
			ybase[col] += VA[row_begin]*xr;

		/* interior nonzeroes (all but first and last), four per pass, no diagonal test */
		nz = row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const double a0 = VA[nz], a1 = VA[nz+1], a2 = VA[nz+2], a3 = VA[nz+3];
			const double prod0 = a0*xr;
			const double prod1 = a1*xr;
			const double prod2 = a2*xr;
			const double prod3 = a3*xr;

			acc += a0*x0;
			ybase[col0] += prod0;
			acc += a1*x1;
			ybase[col1] += prod1;
			acc += a2*x2;
			ybase[col2] += prod2;
			acc += a3*x3;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double a0 = VA[nz];
			const double prod0 = a0*xr;

			acc += a0*x0;
			ybase[col0] += prod0;
			++nz;
		}

		/* last nonzero (when the row has more than one): same diagonal test as the first */
		if(nz<row_end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ybase[col] += VA[nz]*xr;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^T} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	/* The symmetric transposed case reverts to the symmetric untransposed (tN) kernel: all arguments are forwarded as they are. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^T} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	/* The symmetric transposed case reverts to the symmetric untransposed (tN) kernel: all arguments are forwarded as they are. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^H} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	/* The symmetric transposed case reverts to the symmetric untransposed (tN) kernel: all arguments are forwarded as they are. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^H} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	/* The symmetric transposed case reverts to the symmetric untransposed (tN) kernel: all arguments are forwarded as they are. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A} \cdot x\f$, with A Hermitian (\f$A == A^H\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * Every stored nonzero is used twice: once for the row result and once for the mirrored position;
	 * the values are plain doubles, so no conjugation step appears in the code.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); target of the mirrored updates */
	rsb_coo_idx_t row;

	/* out goes through rsb__cblas_Xscal() with a NULL factor first (presumably zeroing Mdim entries -- TODO confirm) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,Mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		double acc = 0.0;	/* row-wise partial result */
		const double xr = xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* no nonzeroes on this row: out[row] stays untouched */

		/* first nonzero: the mirrored update is skipped only for an on-diagonal entry (roff==coff and col==row) */
		col = bindx[row_begin];
		acc += VA[row_begin]*rhs[col];
		if(!(roff==coff && col==row))
			ybase[col] += VA[row_begin]*xr;

		/* interior nonzeroes (all but first and last), four per pass, no diagonal test */
		nz = row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const double a0 = VA[nz], a1 = VA[nz+1], a2 = VA[nz+2], a3 = VA[nz+3];
			const double prod0 = a0*xr;
			const double prod1 = a1*xr;
			const double prod2 = a2*xr;
			const double prod3 = a3*xr;

			acc += a0*x0;
			ybase[col0] += prod0;
			acc += a1*x1;
			ybase[col1] += prod1;
			acc += a2*x2;
			ybase[col2] += prod2;
			acc += a3*x3;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double a0 = VA[nz];
			const double prod0 = a0*xr;

			acc += a0*x0;
			ybase[col0] += prod0;
			++nz;
		}

		/* last nonzero (when the row has more than one): same diagonal test as the first */
		if(nz<row_end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ybase[col] += VA[nz]*xr;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A} \cdot x\f$, with A Hermitian (\f$A == A^H\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * Every stored nonzero is used twice: once for the row result and once for the mirrored position;
	 * the values are plain doubles, so no conjugation step appears in the code.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); target of the mirrored updates */
	rsb_coo_idx_t row;

	/* out goes through rsb__cblas_Xscal() with a NULL factor first (presumably zeroing Mdim entries -- TODO confirm) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_DOUBLE ,Mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		double acc = 0.0;	/* row-wise partial result */
		const double xr = xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* no nonzeroes on this row: out[row] stays untouched */

		/* first nonzero: the mirrored update is skipped only for an on-diagonal entry (roff==coff and col==row) */
		col = bindx[row_begin];
		acc += VA[row_begin]*rhs[col];
		if(!(roff==coff && col==row))
			ybase[col] += VA[row_begin]*xr;

		/* interior nonzeroes (all but first and last), four per pass, no diagonal test */
		nz = row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const double a0 = VA[nz], a1 = VA[nz+1], a2 = VA[nz+2], a3 = VA[nz+3];
			const double prod0 = a0*xr;
			const double prod1 = a1*xr;
			const double prod2 = a2*xr;
			const double prod3 = a3*xr;

			acc += a0*x0;
			ybase[col0] += prod0;
			acc += a1*x1;
			ybase[col1] += prod1;
			acc += a2*x2;
			ybase[col2] += prod2;
			acc += a3*x3;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double a0 = VA[nz];
			const double prod0 = a0*xr;

			acc += a0*x0;
			ybase[col0] += prod0;
			++nz;
		}

		/* last nonzero (when the row has more than one): same diagonal test as the first */
		if(nz<row_end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ybase[col] += VA[nz]*xr;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^T} \cdot x\f$, with A Hermitian (\f$A == A^H\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	/* The transposed case reverts to the untransposed (tN) kernel of the same symmetry: all arguments are forwarded as they are. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^T} \cdot x\f$, with A Hermitian (\f$A == A^H\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	/* The transposed case reverts to the untransposed (tN) kernel of the same symmetry: all arguments are forwarded as they are. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_C__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^H} \cdot x\f$, with A Hermitian (\f$A == A^H\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	/* The transposed case reverts to the untransposed (tN) kernel of the same symmetry: all arguments are forwarded as they are. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_double_H__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow {A^H} \cdot x\f$, with A Hermitian (\f$A == A^H\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied by this kernel; that is left to the caller. */
	/* The transposed case reverts to the untransposed (tN) kernel of the same symmetry: all arguments are forwarded as they are. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 * The scaling factor is read once from alphap.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double alpha = *alphap;
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch: the rows visited are those in [br,bc) */
	for(row=br;RSB_LIKELY(row<bc);++row)
	{
		double acc = 0.0;	/* unscaled dot product of this row with rhs */
		rsb_nnz_idx_t nz = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* main part: four nonzeroes per pass, accumulated in storage order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const double a0 = VA[nz], a1 = VA[nz+1], a2 = VA[nz+2], a3 = VA[nz+3];

			acc += a0*x0;
			acc += a1*x1;
			acc += a2*x2;
			acc += a3*x3;
			nz += 4;
		}

		/* tail: the (at most three) nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double a0 = VA[nz];

			acc += a0*x0;
			++nz;
		}

		out[row] += alpha*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 * The scaling factor is read once from alphap.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double alpha = *alphap;
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch: the rows visited are those in [br,bc) */
	for(row=br;RSB_LIKELY(row<bc);++row)
	{
		double acc = 0.0;	/* unscaled dot product of this row with rhs */
		rsb_nnz_idx_t nz = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* main part: four nonzeroes per pass, accumulated in storage order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const double a0 = VA[nz], a1 = VA[nz+1], a2 = VA[nz+2], a3 = VA[nz+3];

			acc += a0*x0;
			acc += a1*x1;
			acc += a2*x2;
			acc += a3*x3;
			nz += 4;
		}

		/* tail: the (at most three) nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double a0 = VA[nz];

			acc += a0*x0;
			++nz;
		}

		out[row] += alpha*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 * The scaling factor is read once from alphap and folded into the per-row rhs value.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double alpha = *alphap;
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); updated per column index */
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch: the rows visited are those in [br,bc) */
	for(row=br;RSB_LIKELY(row<bc);++row)
	{
		const double sxr = alpha*xbase[row];	/* alpha-scaled rhs entry for this row */
		rsb_nnz_idx_t nz = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* main part: four nonzeroes per pass, products formed before the scatter */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double prod0 = VA[nz]*sxr;
			const double prod1 = VA[nz+1]*sxr;
			const double prod2 = VA[nz+2]*sxr;
			const double prod3 = VA[nz+3]*sxr;

			ybase[col0] += prod0;
			ybase[col1] += prod1;
			ybase[col2] += prod2;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* tail: the (at most three) nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double prod = VA[nz]*sxr;

			ybase[col] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 * The scaling factor is read once from alphap and folded into the per-row rhs value.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double alpha = *alphap;
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); updated per column index */
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch: the rows visited are those in [br,bc) */
	for(row=br;RSB_LIKELY(row<bc);++row)
	{
		const double sxr = alpha*xbase[row];	/* alpha-scaled rhs entry for this row */
		rsb_nnz_idx_t nz = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* main part: four nonzeroes per pass, products formed before the scatter */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double prod0 = VA[nz]*sxr;
			const double prod1 = VA[nz+1]*sxr;
			const double prod2 = VA[nz+2]*sxr;
			const double prod3 = VA[nz+3]*sxr;

			ybase[col0] += prod0;
			ybase[col1] += prod1;
			ybase[col2] += prod2;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* tail: the (at most three) nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double prod = VA[nz]*sxr;

			ybase[col] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 * The scaling factor is read once from alphap and folded into the per-row rhs value;
	 * the values are plain doubles, so no conjugation step appears in the code.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double alpha = *alphap;
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); updated per column index */
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch: the rows visited are those in [br,bc) */
	for(row=br;RSB_LIKELY(row<bc);++row)
	{
		const double sxr = alpha*xbase[row];	/* alpha-scaled rhs entry for this row */
		rsb_nnz_idx_t nz = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* main part: four nonzeroes per pass, products formed before the scatter */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double prod0 = VA[nz]*sxr;
			const double prod1 = VA[nz+1]*sxr;
			const double prod2 = VA[nz+2]*sxr;
			const double prod3 = VA[nz+3]*sxr;

			ybase[col0] += prod0;
			ybase[col1] += prod1;
			ybase[col2] += prod2;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* tail: the (at most three) nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double prod = VA[nz]*sxr;

			ybase[col] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 * The scaling factor is read once from alphap and folded into the per-row rhs value;
	 * the values are plain doubles, so no conjugation step appears in the code.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double alpha = *alphap;
	const double * const xbase = rhs+(roff-coff);	/* rhs shifted by (roff-coff); read once per row */
	double * const ybase = out+(coff-roff);	/* out shifted by (coff-roff); updated per column index */
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch: the rows visited are those in [br,bc) */
	for(row=br;RSB_LIKELY(row<bc);++row)
	{
		const double sxr = alpha*xbase[row];	/* alpha-scaled rhs entry for this row */
		rsb_nnz_idx_t nz = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* main part: four nonzeroes per pass, products formed before the scatter */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const double prod0 = VA[nz]*sxr;
			const double prod1 = VA[nz+1]*sxr;
			const double prod2 = VA[nz+2]*sxr;
			const double prod3 = VA[nz+3]*sxr;

			ybase[col0] += prod0;
			ybase[col1] += prod1;
			ybase[col2] += prod2;
			ybase[col3] += prod3;
			nz += 4;
		}

		/* tail: the (at most three) nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double prod = VA[nz]*sxr;

			ybase[col] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Every stored nonzero of row \c i with column index \c j is used twice:
	 * once in the dot product accumulated into \c out[i], and once in the
	 * mirrored update of \c out[(coff-roff)+j]. For the first and the last
	 * nonzero of a row the mirrored update is skipped when \c roff==coff and
	 * \c j==i; the nonzeroes in between are always mirrored.
	 *
	 * \param VA        nonzero values, addressed by the same position as \c bindx
	 * \param rhs       vector x
	 * \param out       vector y, updated in place
	 * \param bindx     column index of each nonzero
	 * \param bpntr     the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc     rows from \c br (included) to \c bc (excluded) are visited
	 * \param roff,coff offsets used for the mirrored part and for the diagonal test
	 * \param alphap    pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const double scale = *alphap;
	const double *const xsh = rhs + (roff - coff);	/* shifted input, feeds the mirrored part */
	double *const ysh = out + (coff - roff);	/* shifted output, target of the mirrored part */
	const int shifted = (roff != coff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double sx = scale * xsh[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;
		double dot = ((double)(0));

		if (first == end)
			continue;	/* empty row: nothing to accumulate */

		/* first nonzero: mirrored only if it is not recognized as a diagonal entry */
		col = bindx[first];
		dot += VA[first] * rhs[col];
		if (shifted || col != row)
			ysh[col] += VA[first] * sx;

		/* nonzeroes strictly between the first and the last one, four per pass */
		nz = first + 1;
		while (nz + 3 < end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * sx;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz + 1];
			const double add1 = v1 * sx;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz + 2];
			const double add2 = v2 * sx;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz + 3];
			const double add3 = v3 * sx;

			dot += v0 * x0;
			ysh[col0] += add0;
			dot += v1 * x1;
			ysh[col1] += add1;
			dot += v2 * x2;
			ysh[col2] += add2;
			dot += v3 * x3;
			ysh[col3] += add3;
			nz += 4;
		}

		/* leftover in-between nonzeroes, one at a time */
		while (nz < end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * sx;

			dot += v0 * x0;
			ysh[col0] += add0;
			++nz;
		}

		/* last nonzero (absent when the row holds a single one): same test as the first */
		if (nz < end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (shifted || col != row)
				ysh[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Every stored nonzero of row \c i with column index \c j is used twice:
	 * once in the dot product accumulated into \c out[i], and once in the
	 * mirrored update of \c out[(coff-roff)+j]. For the first and the last
	 * nonzero of a row the mirrored update is skipped when \c roff==coff and
	 * \c j==i; the nonzeroes in between are always mirrored.
	 *
	 * \param VA        nonzero values, addressed by the same position as \c bindx
	 * \param rhs       vector x
	 * \param out       vector y, updated in place
	 * \param bindx     column index of each nonzero (converted to rsb_coo_idx_t before use)
	 * \param bpntr     the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc     rows from \c br (included) to \c bc (excluded) are visited
	 * \param roff,coff offsets used for the mirrored part and for the diagonal test
	 * \param alphap    pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const double scale = *alphap;
	const double *const xsh = rhs + (roff - coff);	/* shifted input, feeds the mirrored part */
	double *const ysh = out + (coff - roff);	/* shifted output, target of the mirrored part */
	const int shifted = (roff != coff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double sx = scale * xsh[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;
		double dot = ((double)(0));

		if (first == end)
			continue;	/* empty row: nothing to accumulate */

		/* first nonzero: mirrored only if it is not recognized as a diagonal entry */
		col = bindx[first];
		dot += VA[first] * rhs[col];
		if (shifted || col != row)
			ysh[col] += VA[first] * sx;

		/* nonzeroes strictly between the first and the last one, four per pass */
		nz = first + 1;
		while (nz + 3 < end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * sx;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz + 1];
			const double add1 = v1 * sx;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz + 2];
			const double add2 = v2 * sx;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz + 3];
			const double add3 = v3 * sx;

			dot += v0 * x0;
			ysh[col0] += add0;
			dot += v1 * x1;
			ysh[col1] += add1;
			dot += v2 * x2;
			ysh[col2] += add2;
			dot += v3 * x3;
			ysh[col3] += add3;
			nz += 4;
		}

		/* leftover in-between nonzeroes, one at a time */
		while (nz < end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * sx;

			dot += v0 * x0;
			ysh[col0] += add0;
			++nz;
		}

		/* last nonzero (absent when the row holds a single one): same test as the first */
		if (nz < end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (shifted || col != row)
				ysh[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments are forwarded unchanged to the
	 * untransposed symmetric kernel, whose return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With A == A^T the transposed product coincides with the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments are forwarded unchanged to the
	 * untransposed symmetric kernel, whose return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With A == A^T the transposed product coincides with the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments are forwarded unchanged to the
	 * untransposed symmetric kernel, whose return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Real values and A == A^T: the conjugate transposed product coincides with the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments are forwarded unchanged to the
	 * untransposed symmetric kernel, whose return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Real values and A == A^T: the conjugate transposed product coincides with the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Every stored nonzero of row \c i with column index \c j is used twice:
	 * once in the dot product accumulated into \c out[i], and once in the
	 * mirrored update of \c out[(coff-roff)+j], with the value taken as stored
	 * (no conjugation is applied to the double values). For the first and the
	 * last nonzero of a row the mirrored update is skipped when \c roff==coff
	 * and \c j==i; the nonzeroes in between are always mirrored.
	 *
	 * \param VA        nonzero values, addressed by the same position as \c bindx
	 * \param rhs       vector x
	 * \param out       vector y, updated in place
	 * \param bindx     column index of each nonzero
	 * \param bpntr     the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc     rows from \c br (included) to \c bc (excluded) are visited
	 * \param roff,coff offsets used for the mirrored part and for the diagonal test
	 * \param alphap    pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const double scale = *alphap;
	const double *const xsh = rhs + (roff - coff);	/* shifted input, feeds the mirrored part */
	double *const ysh = out + (coff - roff);	/* shifted output, target of the mirrored part */
	const int shifted = (roff != coff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double sx = scale * xsh[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;
		double dot = ((double)(0));

		if (first == end)
			continue;	/* empty row: nothing to accumulate */

		/* first nonzero: mirrored only if it is not recognized as a diagonal entry */
		col = bindx[first];
		dot += VA[first] * rhs[col];
		if (shifted || col != row)
			ysh[col] += VA[first] * sx;

		/* nonzeroes strictly between the first and the last one, four per pass */
		nz = first + 1;
		while (nz + 3 < end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * sx;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz + 1];
			const double add1 = v1 * sx;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz + 2];
			const double add2 = v2 * sx;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz + 3];
			const double add3 = v3 * sx;

			dot += v0 * x0;
			ysh[col0] += add0;
			dot += v1 * x1;
			ysh[col1] += add1;
			dot += v2 * x2;
			ysh[col2] += add2;
			dot += v3 * x3;
			ysh[col3] += add3;
			nz += 4;
		}

		/* leftover in-between nonzeroes, one at a time */
		while (nz < end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * sx;

			dot += v0 * x0;
			ysh[col0] += add0;
			++nz;
		}

		/* last nonzero (absent when the row holds a single one): same test as the first */
		if (nz < end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (shifted || col != row)
				ysh[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Every stored nonzero of row \c i with column index \c j is used twice:
	 * once in the dot product accumulated into \c out[i], and once in the
	 * mirrored update of \c out[(coff-roff)+j], with the value taken as stored
	 * (no conjugation is applied to the double values). For the first and the
	 * last nonzero of a row the mirrored update is skipped when \c roff==coff
	 * and \c j==i; the nonzeroes in between are always mirrored.
	 *
	 * \param VA        nonzero values, addressed by the same position as \c bindx
	 * \param rhs       vector x
	 * \param out       vector y, updated in place
	 * \param bindx     column index of each nonzero (converted to rsb_coo_idx_t before use)
	 * \param bpntr     the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc     rows from \c br (included) to \c bc (excluded) are visited
	 * \param roff,coff offsets used for the mirrored part and for the diagonal test
	 * \param alphap    pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const double scale = *alphap;
	const double *const xsh = rhs + (roff - coff);	/* shifted input, feeds the mirrored part */
	double *const ysh = out + (coff - roff);	/* shifted output, target of the mirrored part */
	const int shifted = (roff != coff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double sx = scale * xsh[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;
		double dot = ((double)(0));

		if (first == end)
			continue;	/* empty row: nothing to accumulate */

		/* first nonzero: mirrored only if it is not recognized as a diagonal entry */
		col = bindx[first];
		dot += VA[first] * rhs[col];
		if (shifted || col != row)
			ysh[col] += VA[first] * sx;

		/* nonzeroes strictly between the first and the last one, four per pass */
		nz = first + 1;
		while (nz + 3 < end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * sx;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz + 1];
			const double add1 = v1 * sx;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz + 2];
			const double add2 = v2 * sx;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz + 3];
			const double add3 = v3 * sx;

			dot += v0 * x0;
			ysh[col0] += add0;
			dot += v1 * x1;
			ysh[col1] += add1;
			dot += v2 * x2;
			ysh[col2] += add2;
			dot += v3 * x3;
			ysh[col3] += add3;
			nz += 4;
		}

		/* leftover in-between nonzeroes, one at a time */
		while (nz < end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * sx;

			dot += v0 * x0;
			ysh[col0] += add0;
			++nz;
		}

		/* last nonzero (absent when the row holds a single one): same test as the first */
		if (nz < end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (shifted || col != row)
				ysh[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments are forwarded unchanged to the
	 * untransposed kernel for the same symmetry, whose return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Real values and A == A^H: the transposed product coincides with the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments are forwarded unchanged to the
	 * untransposed kernel for the same symmetry, whose return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Real values and A == A^H: the transposed product coincides with the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments are forwarded unchanged to the
	 * untransposed kernel for the same symmetry, whose return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With A == A^H the conjugate transposed product coincides with the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments are forwarded unchanged to the
	 * untransposed kernel for the same symmetry, whose return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With A == A^H the conjugate transposed product coincides with the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with A treated as unsymmetric (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each visited row the dot product of its stored values with the matching
	 * entries of \c rhs is accumulated, then added to \c out[i] scaled by \f$\alpha\f$.
	 *
	 * \param VA      nonzero values, addressed by the same position as \c bindx
	 * \param rhs     vector x, read at the stored column indices
	 * \param out     vector y, updated in place at the visited row indices
	 * \param bindx   column index of each nonzero
	 * \param bpntr   the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc   rows from \c br (included) to \c bc (excluded) are visited
	 * \param alphap  pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr, \c roff, \c coff and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double scale = *alphap;
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		double dot = ((double)(0));

		/* main part: four nonzeroes per pass, accumulated in storage order */
		while (nz + 3 < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz + 3];

			dot += v0 * x0;
			dot += v1 * x1;
			dot += v2 * x2;
			dot += v3 * x3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes are left */
		while (nz < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];

			dot += v0 * x0;
			++nz;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with A treated as unsymmetric (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each visited row the dot product of its stored values with the matching
	 * entries of \c rhs is accumulated, then added to \c out[i] scaled by \f$\alpha\f$.
	 *
	 * \param VA      nonzero values, addressed by the same position as \c bindx
	 * \param rhs     vector x, read at the stored column indices
	 * \param out     vector y, updated in place at the visited row indices
	 * \param bindx   column index of each nonzero (converted to rsb_coo_idx_t before use)
	 * \param bpntr   the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc   rows from \c br (included) to \c bc (excluded) are visited
	 * \param alphap  pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr, \c roff, \c coff and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double scale = *alphap;
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		double dot = ((double)(0));

		/* main part: four nonzeroes per pass, accumulated in storage order */
		while (nz + 3 < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double x1 = rhs[col1];
			const double v1 = VA[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double x2 = rhs[col2];
			const double v2 = VA[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x3 = rhs[col3];
			const double v3 = VA[nz + 3];

			dot += v0 * x0;
			dot += v1 * x1;
			dot += v2 * x2;
			dot += v3 * x3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes are left */
		while (nz < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];

			dot += v0 * x0;
			++nz;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with A treated as unsymmetric (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * \param VA        nonzero values, addressed by the same position as \c bindx
	 * \param rhs       vector x, read at \c rhs[(roff-coff)+i] for every visited row \c i
	 * \param out       vector y, updated in place at \c out[(coff-roff)+j] for every stored column index \c j
	 * \param bindx     column index of each nonzero
	 * \param bpntr     the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc     rows from \c br (included) to \c bc (excluded) are visited
	 * \param roff,coff offsets used to shift \c rhs and \c out as described above
	 * \param alphap    pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double scale = *alphap;
	const double *const xsh = rhs + (roff - coff);	/* shifted view of the input vector */
	double *const ysh = out + (coff - roff);	/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double sx = scale * xsh[row];	/* alpha * x entry shared by the whole row */
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all products formed before the updates */
		while (nz + 3 < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double add0 = VA[nz] * sx;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double add1 = VA[nz + 1] * sx;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double add2 = VA[nz + 2] * sx;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double add3 = VA[nz + 3] * sx;

			ysh[col0] += add0;
			ysh[col1] += add1;
			ysh[col2] += add2;
			ysh[col3] += add3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes are left */
		while (nz < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double add0 = VA[nz] * sx;

			ysh[col0] += add0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with A treated as unsymmetric (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * \param VA        nonzero values, addressed by the same position as \c bindx
	 * \param rhs       vector x, read at \c rhs[(roff-coff)+i] for every visited row \c i
	 * \param out       vector y, updated in place at \c out[(coff-roff)+j] for every stored column index \c j
	 * \param bindx     column index of each nonzero (converted to rsb_coo_idx_t before use)
	 * \param bpntr     the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc     rows from \c br (included) to \c bc (excluded) are visited
	 * \param roff,coff offsets used to shift \c rhs and \c out as described above
	 * \param alphap    pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double scale = *alphap;
	const double *const xsh = rhs + (roff - coff);	/* shifted view of the input vector */
	double *const ysh = out + (coff - roff);	/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double sx = scale * xsh[row];	/* alpha * x entry shared by the whole row */
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all products formed before the updates */
		while (nz + 3 < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double add0 = VA[nz] * sx;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double add1 = VA[nz + 1] * sx;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double add2 = VA[nz + 2] * sx;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double add3 = VA[nz + 3] * sx;

			ysh[col0] += add0;
			ysh[col1] += add1;
			ysh[col2] += add2;
			ysh[col3] += add3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes are left */
		while (nz < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double add0 = VA[nz] * sx;

			ysh[col0] += add0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with A treated as unsymmetric (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * The stored values are used as they are (no conjugation is applied to the double values).
	 *
	 * \param VA        nonzero values, addressed by the same position as \c bindx
	 * \param rhs       vector x, read at \c rhs[(roff-coff)+i] for every visited row \c i
	 * \param out       vector y, updated in place at \c out[(coff-roff)+j] for every stored column index \c j
	 * \param bindx     column index of each nonzero
	 * \param bpntr     the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc     rows from \c br (included) to \c bc (excluded) are visited
	 * \param roff,coff offsets used to shift \c rhs and \c out as described above
	 * \param alphap    pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double scale = *alphap;
	const double *const xsh = rhs + (roff - coff);	/* shifted view of the input vector */
	double *const ysh = out + (coff - roff);	/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double sx = scale * xsh[row];	/* alpha * x entry shared by the whole row */
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all products formed before the updates */
		while (nz + 3 < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double add0 = VA[nz] * sx;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double add1 = VA[nz + 1] * sx;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double add2 = VA[nz + 2] * sx;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double add3 = VA[nz + 3] * sx;

			ysh[col0] += add0;
			ysh[col1] += add1;
			ysh[col2] += add2;
			ysh[col3] += add3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes are left */
		while (nz < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double add0 = VA[nz] * sx;

			ysh[col0] += add0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with A treated as unsymmetric (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * The stored values are used as they are (no conjugation is applied to the double values).
	 *
	 * \param VA        nonzero values, addressed by the same position as \c bindx
	 * \param rhs       vector x, read at \c rhs[(roff-coff)+i] for every visited row \c i
	 * \param out       vector y, updated in place at \c out[(coff-roff)+j] for every stored column index \c j
	 * \param bindx     column index of each nonzero (converted to rsb_coo_idx_t before use)
	 * \param bpntr     the nonzeroes of row \c i occupy positions \c bpntr[i] up to \c bpntr[i+1]-1
	 * \param br,bc     rows from \c br (included) to \c bc (excluded) are visited
	 * \param roff,coff offsets used to shift \c rhs and \c out as described above
	 * \param alphap    pointer to the scaling factor \f$\alpha\f$
	 * \note \c Mdim, \c mdim, \c indptr, \c rpntr, \c cpntr and \c flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double scale = *alphap;
	const double *const xsh = rhs + (roff - coff);	/* shifted view of the input vector */
	double *const ysh = out + (coff - roff);	/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double sx = scale * xsh[row];	/* alpha * x entry shared by the whole row */
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per pass, all products formed before the updates */
		while (nz + 3 < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double add0 = VA[nz] * sx;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const double add1 = VA[nz + 1] * sx;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const double add2 = VA[nz + 2] * sx;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double add3 = VA[nz + 3] * sx;

			ysh[col0] += add0;
			ysh[col1] += add1;
			ysh[col2] += add2;
			ysh[col3] += add3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes are left */
		while (nz < end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double add0 = VA[nz] * sx;

			ysh[col0] += add0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For every row i with br <= i < bc each stored value VA[k]
	 * (bpntr[i] <= k < bpntr[i+1]) is used twice: in a dot product with
	 * rhs[bindx[k]] whose alpha-scaled result is added to out[i], and in a
	 * mirrored update that adds VA[k] times alpha times the re-based input
	 * element of row i to the re-based output element bindx[k].
	 * On the first and on the last entry of a row the mirrored update is left
	 * out when roff==coff and the column index equals the row index.
	 * alpha is read once from *alphap.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	const double scale = *alphap;
	/* re-based views used by the mirrored update only */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0, col = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		register double dot = ((double)(0));
		const double xs = (scale) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		if (row_begin == row_end)
			continue;	/* empty row */

		/* first entry of the row, with the diagonal test */
		nz = row_begin;
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (roff != coff || (col != row))
			ybase[col] += VA[nz] * xs;

		/* entries between first and last, four per trip, no diagonal test */
		for (nz = row_begin + 1; nz + 3 < row_end - 1; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x0 = rhs[col0];
			const double x1 = rhs[col1];
			const double x2 = rhs[col2];
			const double x3 = rhs[col3];
			const double v0 = VA[nz];
			const double v1 = VA[nz + 1];
			const double v2 = VA[nz + 2];
			const double v3 = VA[nz + 3];
			const double add0 = v0 * xs;
			const double add1 = v1 * xs;
			const double add2 = v2 * xs;
			const double add3 = v3 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
			dot += v1 * x1;
			ybase[col1] += add1;
			dot += v2 * x2;
			ybase[col2] += add2;
			dot += v3 * x3;
			ybase[col3] += add3;
		}

		/* what the unrolled loop left over, still excluding the last entry */
		for ( ; nz < row_end - 1; ++nz)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
		}

		/* last entry (only reached when the row holds more than one), with the diagonal test */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (roff != coff || (col != row))
				ybase[col] += VA[nz] * xs;
		}

		out[row] += (scale) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For every row i with br <= i < bc each stored value VA[k]
	 * (bpntr[i] <= k < bpntr[i+1]) is used twice: in a dot product with
	 * rhs[bindx[k]] whose alpha-scaled result is added to out[i], and in a
	 * mirrored update that adds VA[k] times alpha times the re-based input
	 * element of row i to the re-based output element bindx[k].
	 * On the first and on the last entry of a row the mirrored update is left
	 * out when roff==coff and the column index equals the row index.
	 * alpha is read once from *alphap.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	const double scale = *alphap;
	/* re-based views used by the mirrored update only */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0, col = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		register double dot = ((double)(0));
		const double xs = (scale) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		if (row_begin == row_end)
			continue;	/* empty row */

		/* first entry of the row, with the diagonal test */
		nz = row_begin;
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (roff != coff || (col != row))
			ybase[col] += VA[nz] * xs;

		/* entries between first and last, four per trip, no diagonal test */
		for (nz = row_begin + 1; nz + 3 < row_end - 1; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x0 = rhs[col0];
			const double x1 = rhs[col1];
			const double x2 = rhs[col2];
			const double x3 = rhs[col3];
			const double v0 = VA[nz];
			const double v1 = VA[nz + 1];
			const double v2 = VA[nz + 2];
			const double v3 = VA[nz + 3];
			const double add0 = v0 * xs;
			const double add1 = v1 * xs;
			const double add2 = v2 * xs;
			const double add3 = v3 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
			dot += v1 * x1;
			ybase[col1] += add1;
			dot += v2 * x2;
			ybase[col2] += add2;
			dot += v3 * x3;
			ybase[col3] += add3;
		}

		/* what the unrolled loop left over, still excluding the last entry */
		for ( ; nz < row_end - 1; ++nz)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
		}

		/* last entry (only reached when the row holds more than one), with the diagonal test */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (roff != coff || (col != row))
				ybase[col] += VA[nz] * xs;
		}

		out[row] += (scale) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No arithmetic is done here: all arguments are forwarded unchanged to the
	 * non-transposed (tN) symmetric kernel and its return value is handed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	/* The symmetric transposed case reverts to the symmetric non-transposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No arithmetic is done here: all arguments are forwarded unchanged to the
	 * non-transposed (tN) symmetric kernel and its return value is handed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	/* The symmetric transposed case reverts to the symmetric non-transposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No arithmetic is done here: all arguments are forwarded unchanged to the
	 * non-transposed (tN) symmetric kernel and its return value is handed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	/* The symmetric transposed case reverts to the symmetric non-transposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No arithmetic is done here: all arguments are forwarded unchanged to the
	 * non-transposed (tN) symmetric kernel and its return value is handed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	/* The symmetric transposed case reverts to the symmetric non-transposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For every row i with br <= i < bc each stored value VA[k]
	 * (bpntr[i] <= k < bpntr[i+1]) is used twice: in a dot product with
	 * rhs[bindx[k]] whose alpha-scaled result is added to out[i], and in a
	 * mirrored update that adds VA[k] times alpha times the re-based input
	 * element of row i to the re-based output element bindx[k].
	 * On the first and on the last entry of a row the mirrored update is left
	 * out when roff==coff and the column index equals the row index.
	 * alpha is read once from *alphap.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	const double scale = *alphap;
	/* re-based views used by the mirrored update only */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0, col = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		register double dot = ((double)(0));
		const double xs = (scale) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		if (row_begin == row_end)
			continue;	/* empty row */

		/* first entry of the row, with the diagonal test */
		nz = row_begin;
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (roff != coff || (col != row))
			ybase[col] += VA[nz] * xs;

		/* entries between first and last, four per trip, no diagonal test */
		for (nz = row_begin + 1; nz + 3 < row_end - 1; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x0 = rhs[col0];
			const double x1 = rhs[col1];
			const double x2 = rhs[col2];
			const double x3 = rhs[col3];
			const double v0 = VA[nz];
			const double v1 = VA[nz + 1];
			const double v2 = VA[nz + 2];
			const double v3 = VA[nz + 3];
			const double add0 = v0 * xs;
			const double add1 = v1 * xs;
			const double add2 = v2 * xs;
			const double add3 = v3 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
			dot += v1 * x1;
			ybase[col1] += add1;
			dot += v2 * x2;
			ybase[col2] += add2;
			dot += v3 * x3;
			ybase[col3] += add3;
		}

		/* what the unrolled loop left over, still excluding the last entry */
		for ( ; nz < row_end - 1; ++nz)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
		}

		/* last entry (only reached when the row holds more than one), with the diagonal test */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (roff != coff || (col != row))
				ybase[col] += VA[nz] * xs;
		}

		out[row] += (scale) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For every row i with br <= i < bc each stored value VA[k]
	 * (bpntr[i] <= k < bpntr[i+1]) is used twice: in a dot product with
	 * rhs[bindx[k]] whose alpha-scaled result is added to out[i], and in a
	 * mirrored update that adds VA[k] times alpha times the re-based input
	 * element of row i to the re-based output element bindx[k].
	 * On the first and on the last entry of a row the mirrored update is left
	 * out when roff==coff and the column index equals the row index.
	 * alpha is read once from *alphap.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	const double scale = *alphap;
	/* re-based views used by the mirrored update only */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0, col = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		register double dot = ((double)(0));
		const double xs = (scale) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		if (row_begin == row_end)
			continue;	/* empty row */

		/* first entry of the row, with the diagonal test */
		nz = row_begin;
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (roff != coff || (col != row))
			ybase[col] += VA[nz] * xs;

		/* entries between first and last, four per trip, no diagonal test */
		for (nz = row_begin + 1; nz + 3 < row_end - 1; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x0 = rhs[col0];
			const double x1 = rhs[col1];
			const double x2 = rhs[col2];
			const double x3 = rhs[col3];
			const double v0 = VA[nz];
			const double v1 = VA[nz + 1];
			const double v2 = VA[nz + 2];
			const double v3 = VA[nz + 3];
			const double add0 = v0 * xs;
			const double add1 = v1 * xs;
			const double add2 = v2 * xs;
			const double add3 = v3 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
			dot += v1 * x1;
			ybase[col1] += add1;
			dot += v2 * x2;
			ybase[col2] += add2;
			dot += v3 * x3;
			ybase[col3] += add3;
		}

		/* what the unrolled loop left over, still excluding the last entry */
		for ( ; nz < row_end - 1; ++nz)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
		}

		/* last entry (only reached when the row holds more than one), with the diagonal test */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (roff != coff || (col != row))
				ybase[col] += VA[nz] * xs;
		}

		out[row] += (scale) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No arithmetic is done here: all arguments are forwarded unchanged to the
	 * non-transposed (tN) hermitian kernel and its return value is handed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	/* The symmetric transposed case reverts to the symmetric non-transposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No arithmetic is done here: all arguments are forwarded unchanged to the
	 * non-transposed (tN) hermitian kernel and its return value is handed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	/* The symmetric transposed case reverts to the symmetric non-transposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_C__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No arithmetic is done here: all arguments are forwarded unchanged to the
	 * non-transposed (tN) hermitian kernel and its return value is handed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	/* The symmetric transposed case reverts to the symmetric non-transposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_double_H__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No arithmetic is done here: all arguments are forwarded unchanged to the
	 * non-transposed (tN) hermitian kernel and its return value is handed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with here; see the caller level. */
	/* The symmetric transposed case reverts to the symmetric non-transposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For every row i with br <= i < bc the products VA[k]*rhs[bindx[k]],
	 * bpntr[i] <= k < bpntr[i+1], are summed in storage order and the negated
	 * sum is added to out[i].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */
	register rsb_coo_idx_t row = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		register double dot = ((double)(0));
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		/* four entries per trip, accumulated one after the other */
		for (nz = row_begin; nz + 3 < row_end; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x0 = rhs[col0];
			const double x1 = rhs[col1];
			const double x2 = rhs[col2];
			const double x3 = rhs[col3];
			const double v0 = VA[nz];
			const double v1 = VA[nz + 1];
			const double v2 = VA[nz + 2];
			const double v3 = VA[nz + 3];

			dot += v0 * x0;
			dot += v1 * x1;
			dot += v2 * x2;
			dot += v3 * x3;
		}

		/* leftover entries of the row, at most three */
		for ( ; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];

			dot += v0 * x0;
		}

		out[row] += (-1) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For every row i with br <= i < bc the products VA[k]*rhs[bindx[k]],
	 * bpntr[i] <= k < bpntr[i+1], are summed in storage order and the negated
	 * sum is added to out[i].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */
	register rsb_coo_idx_t row = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		register double dot = ((double)(0));
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		/* four entries per trip, accumulated one after the other */
		for (nz = row_begin; nz + 3 < row_end; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x0 = rhs[col0];
			const double x1 = rhs[col1];
			const double x2 = rhs[col2];
			const double x3 = rhs[col3];
			const double v0 = VA[nz];
			const double v1 = VA[nz + 1];
			const double v2 = VA[nz + 2];
			const double v3 = VA[nz + 3];

			dot += v0 * x0;
			dot += v1 * x1;
			dot += v2 * x2;
			dot += v3 * x3;
		}

		/* leftover entries of the row, at most three */
		for ( ; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];

			dot += v0 * x0;
		}

		out[row] += (-1) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Every stored row i with br <= i < bc is walked through bpntr[i]..bpntr[i+1]:
	 * each value VA[k] is multiplied by the negated input element of row i
	 * and added to the output element selected by the column index bindx[k].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */
	/* both vectors are re-based by the row/column offset difference */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const double xs = (-1) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		/* four entries per trip: all products are formed before the first store */
		for (nz = row_begin; nz + 3 < row_end; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double add0 = VA[nz] * xs;
			const double add1 = VA[nz + 1] * xs;
			const double add2 = VA[nz + 2] * xs;
			const double add3 = VA[nz + 3] * xs;

			ybase[col0] += add0;
			ybase[col1] += add1;
			ybase[col2] += add2;
			ybase[col3] += add3;
		}

		/* leftover entries of the row, at most three */
		for ( ; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double add = VA[nz] * xs;

			ybase[col] += add;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Every stored row i with br <= i < bc is walked through bpntr[i]..bpntr[i+1]:
	 * each value VA[k] is multiplied by the negated input element of row i
	 * and added to the output element selected by the column index bindx[k].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */
	/* both vectors are re-based by the row/column offset difference */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const double xs = (-1) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		/* four entries per trip: all products are formed before the first store */
		for (nz = row_begin; nz + 3 < row_end; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double add0 = VA[nz] * xs;
			const double add1 = VA[nz + 1] * xs;
			const double add2 = VA[nz + 2] * xs;
			const double add3 = VA[nz + 3] * xs;

			ybase[col0] += add0;
			ybase[col1] += add1;
			ybase[col2] += add2;
			ybase[col3] += add3;
		}

		/* leftover entries of the row, at most three */
		for ( ; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double add = VA[nz] * xs;

			ybase[col] += add;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Every stored row i with br <= i < bc is walked through bpntr[i]..bpntr[i+1]:
	 * each value VA[k] is multiplied by the negated input element of row i
	 * and added to the output element selected by the column index bindx[k].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */
	/* both vectors are re-based by the row/column offset difference */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const double xs = (-1) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		/* four entries per trip: all products are formed before the first store */
		for (nz = row_begin; nz + 3 < row_end; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double add0 = VA[nz] * xs;
			const double add1 = VA[nz + 1] * xs;
			const double add2 = VA[nz + 2] * xs;
			const double add3 = VA[nz + 3] * xs;

			ybase[col0] += add0;
			ybase[col1] += add1;
			ybase[col2] += add2;
			ybase[col3] += add3;
		}

		/* leftover entries of the row, at most three */
		for ( ; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double add = VA[nz] * xs;

			ybase[col] += add;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Every stored row i with br <= i < bc is walked through bpntr[i]..bpntr[i+1]:
	 * each value VA[k] is multiplied by the negated input element of row i
	 * and added to the output element selected by the column index bindx[k].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */
	/* both vectors are re-based by the row/column offset difference */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const double xs = (-1) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		/* four entries per trip: all products are formed before the first store */
		for (nz = row_begin; nz + 3 < row_end; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double add0 = VA[nz] * xs;
			const double add1 = VA[nz + 1] * xs;
			const double add2 = VA[nz + 2] * xs;
			const double add3 = VA[nz + 3] * xs;

			ybase[col0] += add0;
			ybase[col1] += add1;
			ybase[col2] += add2;
			ybase[col3] += add3;
		}

		/* leftover entries of the row, at most three */
		for ( ; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const double add = VA[nz] * xs;

			ybase[col] += add;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For every row i with br <= i < bc each stored value VA[k]
	 * (bpntr[i] <= k < bpntr[i+1]) is used twice: in a dot product with
	 * rhs[bindx[k]] whose negated result is added to out[i], and in a
	 * mirrored update that adds VA[k] times the negated re-based input
	 * element of row i to the re-based output element bindx[k].
	 * On the first and on the last entry of a row the mirrored update is left
	 * out when roff==coff and the column index equals the row index.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (the only value returned here is RSB_ERR_NO_ERROR)
	 */
	/* re-based views used by the mirrored update only */
	const double *xbase = rhs + 1*(roff-coff);
	double *ybase = out + 1*(coff-roff);
	register rsb_coo_idx_t row = 0, col = 0;
	register rsb_nnz_idx_t nz = 0;

	/* br and bc bound the visited rows (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		register double dot = ((double)(0));
		const double xs = (-1) * xbase[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];

		if (row_begin == row_end)
			continue;	/* empty row */

		/* first entry of the row, with the diagonal test */
		nz = row_begin;
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (roff != coff || (col != row))
			ybase[col] += VA[nz] * xs;

		/* entries between first and last, four per trip, no diagonal test */
		for (nz = row_begin + 1; nz + 3 < row_end - 1; nz += 4)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const double x0 = rhs[col0];
			const double x1 = rhs[col1];
			const double x2 = rhs[col2];
			const double x3 = rhs[col3];
			const double v0 = VA[nz];
			const double v1 = VA[nz + 1];
			const double v2 = VA[nz + 2];
			const double v3 = VA[nz + 3];
			const double add0 = v0 * xs;
			const double add1 = v1 * xs;
			const double add2 = v2 * xs;
			const double add3 = v3 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
			dot += v1 * x1;
			ybase[col1] += add1;
			dot += v2 * x2;
			ybase[col2] += add2;
			dot += v3 * x3;
			ybase[col3] += add3;
		}

		/* what the unrolled loop left over, still excluding the last entry */
		for ( ; nz < row_end - 1; ++nz)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const double x0 = rhs[col0];
			const double v0 = VA[nz];
			const double add0 = v0 * xs;

			dot += v0 * x0;
			ybase[col0] += add0;
		}

		/* last entry (only reached when the row holds more than one), with the diagonal test */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (roff != coff || (col != row))
				ybase[col] += VA[nz] * xs;
		}

		out[row] += (-1) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited. Every stored entry (i,j) contributes to out[i]
	 * and, as its mirror image, to the roff/coff-shifted out[j]. The mirror
	 * update is skipped only for the first or the last entry of a row, and only
	 * when roff==coff and j==i; entries in between are never tested.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* rhs and out as seen through the roff/coff displacement (used by the mirrored updates) */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* negated rhs entry paired with this row */
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* leading entry: the only one, besides the trailing one, checked against the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[col];
		if(!(roff == coff && col == row))
			yshift[col] += VA[nz]*scaled_x;

		/* interior entries, four at a time; all loads precede the stores */
		nz = row_begin+1;
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			rowsum += v1*x1;
			yshift[c1] += m1;
			rowsum += v2*x2;
			yshift[c2] += m2;
			rowsum += v3*x3;
			yshift[c3] += m3;
			nz += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				yshift[col] += VA[nz]*scaled_x;
		}
		out[(1*row*1)] += (-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No work is done here: all arguments are passed on, unchanged, to the
	 * untransposed (tN) kernel of the same type, and its result is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With A == A^T the transposed product is the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No work is done here: all arguments are passed on, unchanged, to the
	 * untransposed (tN) kernel of the same type, and its result is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With A == A^T the transposed product is the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No work is done here: all arguments are passed on, unchanged, to the
	 * untransposed (tN) kernel of the same type, and its result is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Entries are real (double) and A == A^T, so A^H x is just A x. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No work is done here: all arguments are passed on, unchanged, to the
	 * untransposed (tN) kernel of the same type, and its result is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Entries are real (double) and A == A^T, so A^H x is just A x. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited. Every stored entry (i,j) contributes to out[i]
	 * and, as its mirror image, to the roff/coff-shifted out[j]; the stored
	 * value is used as is for both (no conjugation appears in this real-typed code).
	 * The mirror update is skipped only for the first or the last entry of a
	 * row, and only when roff==coff and j==i; entries in between are never tested.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* rhs and out as seen through the roff/coff displacement (used by the mirrored updates) */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* negated rhs entry paired with this row */
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* leading entry: the only one, besides the trailing one, checked against the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[col];
		if(!(roff == coff && col == row))
			yshift[col] += VA[nz]*scaled_x;

		/* interior entries, four at a time; all loads precede the stores */
		nz = row_begin+1;
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			rowsum += v1*x1;
			yshift[c1] += m1;
			rowsum += v2*x2;
			yshift[c2] += m2;
			rowsum += v3*x3;
			yshift[c3] += m3;
			nz += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				yshift[col] += VA[nz]*scaled_x;
		}
		out[(1*row*1)] += (-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited. Every stored entry (i,j) contributes to out[i]
	 * and, as its mirror image, to the roff/coff-shifted out[j]; the stored
	 * value is used as is for both (no conjugation appears in this real-typed code).
	 * The mirror update is skipped only for the first or the last entry of a
	 * row, and only when roff==coff and j==i; entries in between are never tested.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* rhs and out as seen through the roff/coff displacement (used by the mirrored updates) */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* negated rhs entry paired with this row */
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* leading entry: the only one, besides the trailing one, checked against the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[col];
		if(!(roff == coff && col == row))
			yshift[col] += VA[nz]*scaled_x;

		/* interior entries, four at a time; all loads precede the stores */
		nz = row_begin+1;
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			rowsum += v1*x1;
			yshift[c1] += m1;
			rowsum += v2*x2;
			yshift[c2] += m2;
			rowsum += v3*x3;
			yshift[c3] += m3;
			nz += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				yshift[col] += VA[nz]*scaled_x;
		}
		out[(1*row*1)] += (-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No work is done here: all arguments are passed on, unchanged, to the
	 * untransposed (tN) kernel of the same type, and its result is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Entries are real (double) and A == A^H, so A^T x is just A x. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No work is done here: all arguments are passed on, unchanged, to the
	 * untransposed (tN) kernel of the same type, and its result is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Entries are real (double) and A == A^H, so A^T x is just A x. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No work is done here: all arguments are passed on, unchanged, to the
	 * untransposed (tN) kernel of the same type, and its result is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With A == A^H the conjugate-transposed product is the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No work is done here: all arguments are passed on, unchanged, to the
	 * untransposed (tN) kernel of the same type, and its result is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With A == A^H the conjugate-transposed product is the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each row in br..bc-1 the products of the stored values with the
	 * matching rhs entries are summed in storage order, and the negated sum is
	 * added to out[row]. Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags
	 * are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		double rowsum = ((double)(0));

		/* four entries at a time, accumulated one after the other */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];

			rowsum += v0*x0;
			rowsum += v1*x1;
			rowsum += v2*x2;
			rowsum += v3*x3;
			nz += 4;
		}

		/* up to three leftover entries */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0];
			const double v0 = VA[nz];

			rowsum += v0*x0;
			++nz;
		}

		out[(1*row*1)] += (-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each row in br..bc-1 the products of the stored values with the
	 * matching rhs entries are summed in storage order, and the negated sum is
	 * added to out[row]. Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags
	 * are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		double rowsum = ((double)(0));

		/* four entries at a time, accumulated one after the other */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];

			rowsum += v0*x0;
			rowsum += v1*x1;
			rowsum += v2*x2;
			rowsum += v3*x3;
			nz += 4;
		}

		/* up to three leftover entries */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0];
			const double v0 = VA[nz];

			rowsum += v0*x0;
			++nz;
		}

		out[(1*row*1)] += (-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each stored row i in br..bc-1, every stored entry (i,j) adds its
	 * value times the negated, roff/coff-shifted rhs entry i to the
	 * roff/coff-shifted out entry j. Mdim, mdim, indptr, rpntr, cpntr and
	 * flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* common factor for the whole stored row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four entries at a time; products first, then the scattered updates in order */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			yshift[c0] += m0;
			yshift[c1] += m1;
			yshift[c2] += m2;
			yshift[c3] += m3;
			nz += 4;
		}

		/* up to three leftover entries */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			yshift[c0] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each stored row i in br..bc-1, every stored entry (i,j) adds its
	 * value times the negated, roff/coff-shifted rhs entry i to the
	 * roff/coff-shifted out entry j. Mdim, mdim, indptr, rpntr, cpntr and
	 * flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* common factor for the whole stored row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four entries at a time; products first, then the scattered updates in order */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			yshift[c0] += m0;
			yshift[c1] += m1;
			yshift[c2] += m2;
			yshift[c3] += m3;
			nz += 4;
		}

		/* up to three leftover entries */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			yshift[c0] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each stored row i in br..bc-1, every stored entry (i,j) adds its
	 * value times the negated, roff/coff-shifted rhs entry i to the
	 * roff/coff-shifted out entry j. Stored values are used as they are: no
	 * conjugation appears in this real-typed (double) code. Mdim, mdim,
	 * indptr, rpntr, cpntr and flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* common factor for the whole stored row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four entries at a time; products first, then the scattered updates in order */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			yshift[c0] += m0;
			yshift[c1] += m1;
			yshift[c2] += m2;
			yshift[c3] += m3;
			nz += 4;
		}

		/* up to three leftover entries */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			yshift[c0] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each stored row i in br..bc-1, every stored entry (i,j) adds its
	 * value times the negated, roff/coff-shifted rhs entry i to the
	 * roff/coff-shifted out entry j. Stored values are used as they are: no
	 * conjugation appears in this real-typed (double) code. Mdim, mdim,
	 * indptr, rpntr, cpntr and flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* common factor for the whole stored row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four entries at a time; products first, then the scattered updates in order */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			yshift[c0] += m0;
			yshift[c1] += m1;
			yshift[c2] += m2;
			yshift[c3] += m3;
			nz += 4;
		}

		/* up to three leftover entries */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			yshift[c0] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited. Every stored entry (i,j) contributes to out[i]
	 * and, as its mirror image, to the roff/coff-shifted out[j]. The mirror
	 * update is skipped only for the first or the last entry of a row, and only
	 * when roff==coff and j==i; entries in between are never tested.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* rhs and out as seen through the roff/coff displacement (used by the mirrored updates) */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* negated rhs entry paired with this row */
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* leading entry: the only one, besides the trailing one, checked against the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[col];
		if(!(roff == coff && col == row))
			yshift[col] += VA[nz]*scaled_x;

		/* interior entries, four at a time; all loads precede the stores */
		nz = row_begin+1;
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			rowsum += v1*x1;
			yshift[c1] += m1;
			rowsum += v2*x2;
			yshift[c2] += m2;
			rowsum += v3*x3;
			yshift[c3] += m3;
			nz += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				yshift[col] += VA[nz]*scaled_x;
		}
		out[(1*row*1)] += (-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited. Every stored entry (i,j) contributes to out[i]
	 * and, as its mirror image, to the roff/coff-shifted out[j]. The mirror
	 * update is skipped only for the first or the last entry of a row, and only
	 * when roff==coff and j==i; entries in between are never tested.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* rhs and out as seen through the roff/coff displacement (used by the mirrored updates) */
	const double *const xshift = rhs+1*(roff-coff);
	double *const yshift = out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const double scaled_x = (-1)*xshift[row];	/* negated rhs entry paired with this row */
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* leading entry: the only one, besides the trailing one, checked against the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[col];
		if(!(roff == coff && col == row))
			yshift[col] += VA[nz]*scaled_x;

		/* interior entries, four at a time; all loads precede the stores */
		nz = row_begin+1;
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0], c1 = bindx[nz+1], c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const double v0 = VA[nz+0], v1 = VA[nz+1], v2 = VA[nz+2], v3 = VA[nz+3];
			const double m0 = v0*scaled_x, m1 = v1*scaled_x, m2 = v2*scaled_x, m3 = v3*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			rowsum += v1*x1;
			yshift[c1] += m1;
			rowsum += v2*x2;
			yshift[c2] += m2;
			rowsum += v3*x3;
			yshift[c3] += m3;
			nz += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0];
			const double v0 = VA[nz];
			const double m0 = v0*scaled_x;

			rowsum += v0*x0;
			yshift[c0] += m0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				yshift[col] += VA[nz]*scaled_x;
		}
		out[(1*row*1)] += (-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments are handed over unchanged
	 * to the non-transposed kernel and its return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sS_dI_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments are handed over unchanged
	 * to the non-transposed kernel and its return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sS_dI_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments are handed over unchanged
	 * to the non-transposed kernel and its return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sS_dI_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments are handed over unchanged
	 * to the non-transposed kernel and its return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sS_dI_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited. Every stored entry feeds the accumulator of
	 * its own row and is also applied, mirrored, to the output entry selected
	 * by its column index. The mirrored update is skipped only for the first
	 * or last stored entry of a row, and only if roff==coff and that entry
	 * has a column index equal to the row index; the entries in between are
	 * mirrored without any test.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *xshift=rhs+(roff-coff);	/* rhs, shifted for indexing by row */
	double *yshift=out+(coff-roff);	/* out, shifted for indexing by column */
	rsb_coo_idx_t row=0,col=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double mxi=(-1)*xshift[row];	/* negated rhs entry of this row */
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		double rowsum=((double)(0));

		if(rbeg==rend)
			continue;	/* nothing stored on this row */

		/* first stored entry: the only leading candidate for the diagonal */
		nz=rbeg;
		col=bindx[nz];
		rowsum += VA[nz]*rhs[col];
		if(!(roff==coff && col==row))
			yshift[col]+=VA[nz]*mxi;

		/* entries strictly between first and last, four per iteration */
		for(nz=rbeg+1;nz+3<rend-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double m0=v0*mxi,m1=v1*mxi,m2=v2*mxi,m3=v3*mxi;

			rowsum += v0*x0;
			yshift[c0]+=m0;
			rowsum += v1*x1;
			yshift[c1]+=m1;
			rowsum += v2*x2;
			yshift[c2]+=m2;
			rowsum += v3*x3;
			yshift[c3]+=m3;
		}

		/* leftover interior entries, one at a time */
		for(;nz<rend-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double m0=v0*mxi;

			rowsum += v0*x0;
			yshift[c0]+=m0;
		}

		/* last stored entry (when distinct from the first): diagonal test again */
		if(nz<rend)
		{
			col=bindx[nz];
			rowsum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yshift[col]+=VA[nz]*mxi;
		}

		out[row]+=(-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited. Every stored entry feeds the accumulator of
	 * its own row and is also applied, mirrored, to the output entry selected
	 * by its column index. The mirrored update is skipped only for the first
	 * or last stored entry of a row, and only if roff==coff and that entry
	 * has a column index equal to the row index; the entries in between are
	 * mirrored without any test.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double *xshift=rhs+(roff-coff);	/* rhs, shifted for indexing by row */
	double *yshift=out+(coff-roff);	/* out, shifted for indexing by column */
	rsb_coo_idx_t row=0,col=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double mxi=(-1)*xshift[row];	/* negated rhs entry of this row */
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		double rowsum=((double)(0));

		if(rbeg==rend)
			continue;	/* nothing stored on this row */

		/* first stored entry: the only leading candidate for the diagonal */
		nz=rbeg;
		col=bindx[nz];
		rowsum += VA[nz]*rhs[col];
		if(!(roff==coff && col==row))
			yshift[col]+=VA[nz]*mxi;

		/* entries strictly between first and last, four per iteration */
		for(nz=rbeg+1;nz+3<rend-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double m0=v0*mxi,m1=v1*mxi,m2=v2*mxi,m3=v3*mxi;

			rowsum += v0*x0;
			yshift[c0]+=m0;
			rowsum += v1*x1;
			yshift[c1]+=m1;
			rowsum += v2*x2;
			yshift[c2]+=m2;
			rowsum += v3*x3;
			yshift[c3]+=m3;
		}

		/* leftover interior entries, one at a time */
		for(;nz<rend-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0];
			const double v0=VA[nz];
			const double m0=v0*mxi;

			rowsum += v0*x0;
			yshift[c0]+=m0;
		}

		/* last stored entry (when distinct from the first): diagonal test again */
		if(nz<rend)
		{
			col=bindx[nz];
			rowsum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yshift[col]+=VA[nz]*mxi;
		}

		out[row]+=(-1)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments are handed over unchanged
	 * to the non-transposed kernel and its return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sH_dI_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments are handed over unchanged
	 * to the non-transposed kernel and its return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sH_dI_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_C__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments are handed over unchanged
	 * to the non-transposed kernel and its return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_C__tN_r1_c1_uu_sH_dI_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_double_H__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments are handed over unchanged
	 * to the non-transposed kernel and its return value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_double_H__tN_r1_c1_uu_sH_dI_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; for each one the products of its stored
	 * values with the rhs entries at (column index)*incx are summed and the
	 * sum is added to out at (row index)*incy.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	rsb_coo_idx_t row=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		double rowsum=((double)(0));

		/* four stored entries per iteration */
		for(nz=rbeg;nz+3<rend;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];

			rowsum+=v0*x0;
			rowsum+=v1*x1;
			rowsum+=v2*x2;
			rowsum+=v3*x3;
		}

		/* up to three leftover entries */
		for(;nz<rend;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0*incx];
			const double v0=VA[nz];

			rowsum+=v0*x0;
		}

		out[row*incy]+=rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; for each one the products of its stored
	 * values with the rhs entries at (column index)*incx are summed and the
	 * sum is added to out at (row index)*incy.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	rsb_coo_idx_t row=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		double rowsum=((double)(0));

		/* four stored entries per iteration */
		for(nz=rbeg;nz+3<rend;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];

			rowsum+=v0*x0;
			rowsum+=v1*x1;
			rowsum+=v2*x2;
			rowsum+=v3*x3;
		}

		/* up to three leftover entries */
		for(;nz<rend;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0*incx];
			const double v0=VA[nz];

			rowsum+=v0*x0;
		}

		out[row*incy]+=rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; the rhs entry belonging to the row is read
	 * once and each stored value of the row, scaled by it, is added to the
	 * output entry selected by the value's column index.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double *xshift=rhs+(incx)*(roff-coff);	/* rhs, shifted for indexing by row */
	double *yshift=out+(incy)*(coff-roff);	/* out, shifted for indexing by column */
	rsb_coo_idx_t row=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xi=xshift[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];

		/* four stored entries per iteration */
		for(nz=rbeg;nz+3<rend;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double m0=v0*xi,m1=v1*xi,m2=v2*xi,m3=v3*xi;

			yshift[c0*incy]+=m0;
			yshift[c1*incy]+=m1;
			yshift[c2*incy]+=m2;
			yshift[c3*incy]+=m3;
		}

		/* up to three leftover entries */
		for(;nz<rend;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double v0=VA[nz];
			const double m0=v0*xi;

			yshift[c0*incy]+=m0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; the rhs entry belonging to the row is read
	 * once and each stored value of the row, scaled by it, is added to the
	 * output entry selected by the value's column index.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double *xshift=rhs+(incx)*(roff-coff);	/* rhs, shifted for indexing by row */
	double *yshift=out+(incy)*(coff-roff);	/* out, shifted for indexing by column */
	rsb_coo_idx_t row=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xi=xshift[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];

		/* four stored entries per iteration */
		for(nz=rbeg;nz+3<rend;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double m0=v0*xi,m1=v1*xi,m2=v2*xi,m3=v3*xi;

			yshift[c0*incy]+=m0;
			yshift[c1*incy]+=m1;
			yshift[c2*incy]+=m2;
			yshift[c3*incy]+=m3;
		}

		/* up to three leftover entries */
		for(;nz<rend;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double v0=VA[nz];
			const double m0=v0*xi;

			yshift[c0*incy]+=m0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; the rhs entry belonging to the row is read
	 * once and each stored value of the row, scaled by it, is added to the
	 * output entry selected by the value's column index. The stored values
	 * are used as they are (no conjugation step appears in this kernel).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double *xshift=rhs+(incx)*(roff-coff);	/* rhs, shifted for indexing by row */
	double *yshift=out+(incy)*(coff-roff);	/* out, shifted for indexing by column */
	rsb_coo_idx_t row=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xi=xshift[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];

		/* four stored entries per iteration */
		for(nz=rbeg;nz+3<rend;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double m0=v0*xi,m1=v1*xi,m2=v2*xi,m3=v3*xi;

			yshift[c0*incy]+=m0;
			yshift[c1*incy]+=m1;
			yshift[c2*incy]+=m2;
			yshift[c3*incy]+=m3;
		}

		/* up to three leftover entries */
		for(;nz<rend;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double v0=VA[nz];
			const double m0=v0*xi;

			yshift[c0*incy]+=m0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; the rhs entry belonging to the row is read
	 * once and each stored value of the row, scaled by it, is added to the
	 * output entry selected by the value's column index. The stored values
	 * are used as they are (no conjugation step appears in this kernel).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double *xshift=rhs+(incx)*(roff-coff);	/* rhs, shifted for indexing by row */
	double *yshift=out+(incy)*(coff-roff);	/* out, shifted for indexing by column */
	rsb_coo_idx_t row=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xi=xshift[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];

		/* four stored entries per iteration */
		for(nz=rbeg;nz+3<rend;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double m0=v0*xi,m1=v1*xi,m2=v2*xi,m3=v3*xi;

			yshift[c0*incy]+=m0;
			yshift[c1*incy]+=m1;
			yshift[c2*incy]+=m2;
			yshift[c3*incy]+=m3;
		}

		/* up to three leftover entries */
		for(;nz<rend;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double v0=VA[nz];
			const double m0=v0*xi;

			yshift[c0*incy]+=m0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited, with rhs strided by incx and out by incy.
	 * Every stored entry feeds the accumulator of its own row and is also
	 * applied, mirrored, to the output entry selected by its column index.
	 * The mirrored update is skipped only for the first or last stored entry
	 * of a row, and only if roff==coff and that entry has a column index
	 * equal to the row index; the entries in between are mirrored without
	 * any test.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double *xshift=rhs+(incx)*(roff-coff);	/* rhs, shifted for indexing by row */
	double *yshift=out+(incy)*(coff-roff);	/* out, shifted for indexing by column */
	rsb_coo_idx_t row=0,col=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xi=xshift[incx*row];	/* rhs entry of this row */
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		double rowsum=((double)(0));

		if(rbeg==rend)
			continue;	/* nothing stored on this row */

		/* first stored entry: the only leading candidate for the diagonal */
		nz=rbeg;
		col=bindx[nz];
		rowsum += VA[nz]*rhs[col*incx];
		if(!(roff==coff && col==row))
			yshift[col*incy]+=VA[nz]*xi;

		/* entries strictly between first and last, four per iteration */
		for(nz=rbeg+1;nz+3<rend-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double m0=v0*xi,m1=v1*xi,m2=v2*xi,m3=v3*xi;

			rowsum += v0*x0;
			yshift[c0*incy]+=m0;
			rowsum += v1*x1;
			yshift[c1*incy]+=m1;
			rowsum += v2*x2;
			yshift[c2*incy]+=m2;
			rowsum += v3*x3;
			yshift[c3*incy]+=m3;
		}

		/* leftover interior entries, one at a time */
		for(;nz<rend-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0*incx];
			const double v0=VA[nz];
			const double m0=v0*xi;

			rowsum += v0*x0;
			yshift[c0*incy]+=m0;
		}

		/* last stored entry (when distinct from the first): diagonal test again */
		if(nz<rend)
		{
			col=bindx[nz];
			rowsum += VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				yshift[col*incy]+=VA[nz]*xi;
		}

		out[row*incy]+=rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited, with rhs strided by incx and out by incy.
	 * Every stored entry feeds the accumulator of its own row and is also
	 * applied, mirrored, to the output entry selected by its column index.
	 * The mirrored update is skipped only for the first or last stored entry
	 * of a row, and only if roff==coff and that entry has a column index
	 * equal to the row index; the entries in between are mirrored without
	 * any test.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	const double *xshift=rhs+(incx)*(roff-coff);	/* rhs, shifted for indexing by row */
	double *yshift=out+(incy)*(coff-roff);	/* out, shifted for indexing by column */
	rsb_coo_idx_t row=0,col=0;
	rsb_nnz_idx_t nz=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xi=xshift[incx*row];	/* rhs entry of this row */
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		double rowsum=((double)(0));

		if(rbeg==rend)
			continue;	/* nothing stored on this row */

		/* first stored entry: the only leading candidate for the diagonal */
		nz=rbeg;
		col=bindx[nz];
		rowsum += VA[nz]*rhs[col*incx];
		if(!(roff==coff && col==row))
			yshift[col*incy]+=VA[nz]*xi;

		/* entries strictly between first and last, four per iteration */
		for(nz=rbeg+1;nz+3<rend-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const double x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];
			const double v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const double m0=v0*xi,m1=v1*xi,m2=v2*xi,m3=v3*xi;

			rowsum += v0*x0;
			yshift[c0*incy]+=m0;
			rowsum += v1*x1;
			yshift[c1*incy]+=m1;
			rowsum += v2*x2;
			yshift[c2*incy]+=m2;
			rowsum += v3*x3;
			yshift[c3*incy]+=m3;
		}

		/* leftover interior entries, one at a time */
		for(;nz<rend-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const double x0=rhs[c0*incx];
			const double v0=VA[nz];
			const double m0=v0*xi;

			rowsum += v0*x0;
			yshift[c0*incy]+=m0;
		}

		/* last stored entry (when distinct from the first): diagonal test again */
		if(nz<rend)
		{
			col=bindx[nz];
			rowsum += VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				yshift[col*incy]+=VA[nz]*xi;
		}

		out[row*incy]+=rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done here: all arguments, strides included, are
	 * handed over unchanged to the non-transposed kernel and its return
	 * value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sS_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * No computation is done here: all arguments, strides included, are
	 * handed over unchanged to the non-transposed kernel and its return
	 * value is passed back.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sS_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The symmetric (conjugate) transposed case reverts to the symmetric non
	 * transposed one: all the arguments are handed over unchanged to the tN
	 * kernel, and the error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The symmetric (conjugate) transposed case reverts to the symmetric non
	 * transposed one: all the arguments are handed over unchanged to the tN
	 * kernel, and the error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each row index in [br,bc) every stored entry is used twice: its
	 * product with the rhs element of its column is summed into the output
	 * element of the row, and its product with the rhs element of the row is
	 * added to the output element of its column (the mirrored update).
	 * The mirrored update reads rhs shifted by (roff-coff)*incx and writes out
	 * shifted by (coff-roff)*incy. No conjugation is applied in this kernel.
	 * Only the first and the last entry of a row are checked against the
	 * diagonal (roff==coff and column==row); on a hit the mirrored update is
	 * skipped.
	 * NOTE(review): presumably the diagonal can only be first or last because column indices are sorted within a row -- confirm at caller level.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double * const xt = rhs+(incx)*(roff-coff);	/* rhs, as addressed by the mirrored update */
	double * const yt = out+(incy)*(coff-roff);	/* out, as addressed by the mirrored update */
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const double xi = xt[incx*ri];
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;
		rsb_coo_idx_t cj;
		double acc = 0.0;

		if(nzb==nze)
			continue;	/* empty row: nothing is written */

		/* first entry of the row, with the diagonal check */
		cj = bindx[nz];
		acc += VA[nz]*rhs[cj*incx];
		if(!(roff==coff && cj==ri))
			yt[cj*incy] += VA[nz]*xi;
		++nz;

		/* inner entries, four per iteration; the last entry is kept out */
		while(nz+3 < nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0*incx], x1 = rhs[c1*incx];
			const double x2 = rhs[c2*incx], x3 = rhs[c3*incx];
			const double v0 = VA[nz  ], v1 = VA[nz+1];
			const double v2 = VA[nz+2], v3 = VA[nz+3];

			acc += v0*x0;
			yt[c0*incy] += v0*xi;
			acc += v1*x1;
			yt[c1*incy] += v1*xi;
			acc += v2*x2;
			yt[c2*incy] += v2*xi;
			acc += v3*x3;
			yt[c3*incy] += v3*xi;
			nz += 4;
		}

		/* remaining inner entries, one per iteration */
		for(;nz<nze-1;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*incx];
			const double v0 = VA[nz];

			acc += v0*x0;
			yt[c0*incy] += v0*xi;
		}

		/* last entry of the row (when the row has more than one), with the diagonal check */
		if(nz<nze)
		{
			cj = bindx[nz];
			acc += VA[nz]*rhs[cj*incx];
			if(!(roff==coff && cj==ri))
				yt[cj*incy] += VA[nz]*xi;
		}
		out[ri*incy] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each row index in [br,bc) every stored entry is used twice: its
	 * product with the rhs element of its column is summed into the output
	 * element of the row, and its product with the rhs element of the row is
	 * added to the output element of its column (the mirrored update).
	 * The mirrored update reads rhs shifted by (roff-coff)*incx and writes out
	 * shifted by (coff-roff)*incy. No conjugation is applied in this kernel.
	 * Only the first and the last entry of a row are checked against the
	 * diagonal (roff==coff and column==row); on a hit the mirrored update is
	 * skipped.
	 * NOTE(review): presumably the diagonal can only be first or last because column indices are sorted within a row -- confirm at caller level.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */
	const double * const xt = rhs+(incx)*(roff-coff);	/* rhs, as addressed by the mirrored update */
	double * const yt = out+(incy)*(coff-roff);	/* out, as addressed by the mirrored update */
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const double xi = xt[incx*ri];
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;
		rsb_coo_idx_t cj;	/* half-width indices are widened before any index arithmetic */
		double acc = 0.0;

		if(nzb==nze)
			continue;	/* empty row: nothing is written */

		/* first entry of the row, with the diagonal check */
		cj = bindx[nz];
		acc += VA[nz]*rhs[cj*incx];
		if(!(roff==coff && cj==ri))
			yt[cj*incy] += VA[nz]*xi;
		++nz;

		/* inner entries, four per iteration; the last entry is kept out */
		while(nz+3 < nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0*incx], x1 = rhs[c1*incx];
			const double x2 = rhs[c2*incx], x3 = rhs[c3*incx];
			const double v0 = VA[nz  ], v1 = VA[nz+1];
			const double v2 = VA[nz+2], v3 = VA[nz+3];

			acc += v0*x0;
			yt[c0*incy] += v0*xi;
			acc += v1*x1;
			yt[c1*incy] += v1*xi;
			acc += v2*x2;
			yt[c2*incy] += v2*xi;
			acc += v3*x3;
			yt[c3*incy] += v3*xi;
			nz += 4;
		}

		/* remaining inner entries, one per iteration */
		for(;nz<nze-1;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*incx];
			const double v0 = VA[nz];

			acc += v0*x0;
			yt[c0*incy] += v0*xi;
		}

		/* last entry of the row (when the row has more than one), with the diagonal check */
		if(nz<nze)
		{
			cj = bindx[nz];
			acc += VA[nz]*rhs[cj*incx];
			if(!(roff==coff && cj==ri))
				yt[cj*incy] += VA[nz]*xi;
		}
		out[ri*incy] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The transposed case reverts to the non transposed one here: all the
	 * arguments are handed over unchanged to the tN kernel of the same
	 * symmetry, and the error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The transposed case reverts to the non transposed one here: all the
	 * arguments are handed over unchanged to the tN kernel of the same
	 * symmetry, and the error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The (conjugate) transposed case reverts to the non transposed one here:
	 * all the arguments are handed over unchanged to the tN kernel of the same
	 * symmetry, and the error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The (conjugate) transposed case reverts to the non transposed one here:
	 * all the arguments are handed over unchanged to the tN kernel of the same
	 * symmetry, and the error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each row index in [br,bc) the products of the stored entries with
	 * the rhs elements of their columns are summed, in storage order, into a
	 * scalar which is then added to the output element of the row.
	 * rhs is strided by incx, out by incy.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;
		double acc = 0.0;

		/* four entries per iteration */
		while(nz+3 < nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0*incx], x1 = rhs[c1*incx];
			const double x2 = rhs[c2*incx], x3 = rhs[c3*incx];
			const double v0 = VA[nz  ], v1 = VA[nz+1];
			const double v2 = VA[nz+2], v3 = VA[nz+3];

			acc += v0*x0;
			acc += v1*x1;
			acc += v2*x2;
			acc += v3*x3;
			nz += 4;
		}

		/* at most three entries are left */
		for(;nz<nze;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*incx];
			const double v0 = VA[nz];

			acc += v0*x0;
		}

		out[ri*incy] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each row index in [br,bc) the products of the stored entries with
	 * the rhs elements of their columns are summed, in storage order, into a
	 * scalar which is then added to the output element of the row.
	 * rhs is strided by incx, out by incy.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;
		double acc = 0.0;

		/* four entries per iteration; half-width indices are widened first */
		while(nz+3 < nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0*incx], x1 = rhs[c1*incx];
			const double x2 = rhs[c2*incx], x3 = rhs[c3*incx];
			const double v0 = VA[nz  ], v1 = VA[nz+1];
			const double v2 = VA[nz+2], v3 = VA[nz+3];

			acc += v0*x0;
			acc += v1*x1;
			acc += v2*x2;
			acc += v3*x3;
			nz += 4;
		}

		/* at most three entries are left */
		for(;nz<nze;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*incx];
			const double v0 = VA[nz];

			acc += v0*x0;
		}

		out[ri*incy] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each row index in [br,bc) every stored entry is multiplied by the
	 * rhs element of the row, and the product is added to the output element
	 * of the entry's column, in storage order.
	 * rhs is read shifted by (roff-coff)*incx, out is written shifted by
	 * (coff-roff)*incy.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double * const xt = rhs+(incx)*(roff-coff);
	double * const yt = out+(incy)*(coff-roff);
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const double xi = xt[incx*ri];
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;

		/* four entries per iteration: products first, then the updates */
		while(nz+3 < nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double p0 = VA[nz  ]*xi, p1 = VA[nz+1]*xi;
			const double p2 = VA[nz+2]*xi, p3 = VA[nz+3]*xi;

			yt[c0*incy] += p0;
			yt[c1*incy] += p1;
			yt[c2*incy] += p2;
			yt[c3*incy] += p3;
			nz += 4;
		}

		/* at most three entries are left */
		for(;nz<nze;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double p0 = VA[nz]*xi;

			yt[c0*incy] += p0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each row index in [br,bc) every stored entry is multiplied by the
	 * rhs element of the row, and the product is added to the output element
	 * of the entry's column, in storage order.
	 * rhs is read shifted by (roff-coff)*incx, out is written shifted by
	 * (coff-roff)*incy.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double * const xt = rhs+(incx)*(roff-coff);
	double * const yt = out+(incy)*(coff-roff);
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const double xi = xt[incx*ri];
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;

		/* four entries per iteration: products first, then the updates */
		while(nz+3 < nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double p0 = VA[nz  ]*xi, p1 = VA[nz+1]*xi;
			const double p2 = VA[nz+2]*xi, p3 = VA[nz+3]*xi;

			yt[c0*incy] += p0;
			yt[c1*incy] += p1;
			yt[c2*incy] += p2;
			yt[c3*incy] += p3;
			nz += 4;
		}

		/* at most three entries are left */
		for(;nz<nze;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double p0 = VA[nz]*xi;

			yt[c0*incy] += p0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each row index in [br,bc) every stored entry is multiplied by the
	 * rhs element of the row, and the product is added to the output element
	 * of the entry's column, in storage order. No conjugation is applied in
	 * this kernel.
	 * rhs is read shifted by (roff-coff)*incx, out is written shifted by
	 * (coff-roff)*incy.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double * const xt = rhs+(incx)*(roff-coff);
	double * const yt = out+(incy)*(coff-roff);
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const double xi = xt[incx*ri];
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;

		/* four entries per iteration: products first, then the updates */
		while(nz+3 < nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double p0 = VA[nz  ]*xi, p1 = VA[nz+1]*xi;
			const double p2 = VA[nz+2]*xi, p3 = VA[nz+3]*xi;

			yt[c0*incy] += p0;
			yt[c1*incy] += p1;
			yt[c2*incy] += p2;
			yt[c3*incy] += p3;
			nz += 4;
		}

		/* at most three entries are left */
		for(;nz<nze;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double p0 = VA[nz]*xi;

			yt[c0*incy] += p0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each row index in [br,bc) every stored entry is multiplied by the
	 * rhs element of the row, and the product is added to the output element
	 * of the entry's column, in storage order. No conjugation is applied in
	 * this kernel.
	 * rhs is read shifted by (roff-coff)*incx, out is written shifted by
	 * (coff-roff)*incy.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double * const xt = rhs+(incx)*(roff-coff);
	double * const yt = out+(incy)*(coff-roff);
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const double xi = xt[incx*ri];
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;

		/* four entries per iteration: products first, then the updates */
		while(nz+3 < nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double p0 = VA[nz  ]*xi, p1 = VA[nz+1]*xi;
			const double p2 = VA[nz+2]*xi, p3 = VA[nz+3]*xi;

			yt[c0*incy] += p0;
			yt[c1*incy] += p1;
			yt[c2*incy] += p2;
			yt[c3*incy] += p3;
			nz += 4;
		}

		/* at most three entries are left */
		for(;nz<nze;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double p0 = VA[nz]*xi;

			yt[c0*incy] += p0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each row index in [br,bc) every stored entry is used twice: its
	 * product with the rhs element of its column is summed into the output
	 * element of the row, and its product with the rhs element of the row is
	 * added to the output element of its column (the mirrored update).
	 * The mirrored update reads rhs shifted by (roff-coff)*incx and writes out
	 * shifted by (coff-roff)*incy.
	 * Only the first and the last entry of a row are checked against the
	 * diagonal (roff==coff and column==row); on a hit the mirrored update is
	 * skipped.
	 * NOTE(review): presumably the diagonal can only be first or last because column indices are sorted within a row -- confirm at caller level.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double * const xt = rhs+(incx)*(roff-coff);	/* rhs, as addressed by the mirrored update */
	double * const yt = out+(incy)*(coff-roff);	/* out, as addressed by the mirrored update */
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const double xi = xt[incx*ri];
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;
		rsb_coo_idx_t cj;
		double acc = 0.0;

		if(nzb==nze)
			continue;	/* empty row: nothing is written */

		/* first entry of the row, with the diagonal check */
		cj = bindx[nz];
		acc += VA[nz]*rhs[cj*incx];
		if(!(roff==coff && cj==ri))
			yt[cj*incy] += VA[nz]*xi;
		++nz;

		/* inner entries, four per iteration; the last entry is kept out */
		while(nz+3 < nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0*incx], x1 = rhs[c1*incx];
			const double x2 = rhs[c2*incx], x3 = rhs[c3*incx];
			const double v0 = VA[nz  ], v1 = VA[nz+1];
			const double v2 = VA[nz+2], v3 = VA[nz+3];

			acc += v0*x0;
			yt[c0*incy] += v0*xi;
			acc += v1*x1;
			yt[c1*incy] += v1*xi;
			acc += v2*x2;
			yt[c2*incy] += v2*xi;
			acc += v3*x3;
			yt[c3*incy] += v3*xi;
			nz += 4;
		}

		/* remaining inner entries, one per iteration */
		for(;nz<nze-1;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*incx];
			const double v0 = VA[nz];

			acc += v0*x0;
			yt[c0*incy] += v0*xi;
		}

		/* last entry of the row (when the row has more than one), with the diagonal check */
		if(nz<nze)
		{
			cj = bindx[nz];
			acc += VA[nz]*rhs[cj*incx];
			if(!(roff==coff && cj==ri))
				yt[cj*incy] += VA[nz]*xi;
		}
		out[ri*incy] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each row index in [br,bc) every stored entry is used twice: its
	 * product with the rhs element of its column is summed into the output
	 * element of the row, and its product with the rhs element of the row is
	 * added to the output element of its column (the mirrored update).
	 * The mirrored update reads rhs shifted by (roff-coff)*incx and writes out
	 * shifted by (coff-roff)*incy.
	 * Only the first and the last entry of a row are checked against the
	 * diagonal (roff==coff and column==row); on a hit the mirrored update is
	 * skipped.
	 * NOTE(review): presumably the diagonal can only be first or last because column indices are sorted within a row -- confirm at caller level.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double * const xt = rhs+(incx)*(roff-coff);	/* rhs, as addressed by the mirrored update */
	double * const yt = out+(incy)*(coff-roff);	/* out, as addressed by the mirrored update */
	rsb_coo_idx_t ri;

	for(ri=br;RSB_LIKELY(ri<bc);++ri)	/* experimental, for the bounded box patch */
	{
		const double xi = xt[incx*ri];
		const rsb_nnz_idx_t nzb = bpntr[ri], nze = bpntr[ri+1];
		rsb_nnz_idx_t nz = nzb;
		rsb_coo_idx_t cj;	/* half-width indices are widened before any index arithmetic */
		double acc = 0.0;

		if(nzb==nze)
			continue;	/* empty row: nothing is written */

		/* first entry of the row, with the diagonal check */
		cj = bindx[nz];
		acc += VA[nz]*rhs[cj*incx];
		if(!(roff==coff && cj==ri))
			yt[cj*incy] += VA[nz]*xi;
		++nz;

		/* inner entries, four per iteration; the last entry is kept out */
		while(nz+3 < nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz  ], c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2], c3 = bindx[nz+3];
			const double x0 = rhs[c0*incx], x1 = rhs[c1*incx];
			const double x2 = rhs[c2*incx], x3 = rhs[c3*incx];
			const double v0 = VA[nz  ], v1 = VA[nz+1];
			const double v2 = VA[nz+2], v3 = VA[nz+3];

			acc += v0*x0;
			yt[c0*incy] += v0*xi;
			acc += v1*x1;
			yt[c1*incy] += v1*xi;
			acc += v2*x2;
			yt[c2*incy] += v2*xi;
			acc += v3*x3;
			yt[c3*incy] += v3*xi;
			nz += 4;
		}

		/* remaining inner entries, one per iteration */
		for(;nz<nze-1;++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*incx];
			const double v0 = VA[nz];

			acc += v0*x0;
			yt[c0*incy] += v0*xi;
		}

		/* last entry of the row (when the row has more than one), with the diagonal check */
		if(nz<nze)
		{
			cj = bindx[nz];
			acc += VA[nz]*rhs[cj*incx];
			if(!(roff==coff && cj==ri))
				yt[cj*incy] += VA[nz]*xi;
		}
		out[ri*incy] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The symmetric transposed case reverts to the symmetric non transposed one:
	 * all the arguments are handed over unchanged to the tN kernel, and the
	 * error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The symmetric transposed case reverts to the symmetric non transposed one:
	 * all the arguments are handed over unchanged to the tN kernel, and the
	 * error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The symmetric (conjugate) transposed case reverts to the symmetric non
	 * transposed one: all the arguments are handed over unchanged to the tN
	 * kernel, and the error value of that kernel is returned as it is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For the real type double and A == A^T, the conjugate transposed product
	 * is the same as the non transposed one: every argument is handed over
	 * unchanged to the tN variant of this kernel, and the error value of that
	 * call is passed back to the caller.
	 *
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc), the entries VA[k], k in [bpntr[i],bpntr[i+1]),
	 * with column j=bindx[k], are used twice:
	 *  - VA[k]*rhs[j*incx] is summed up and finally added to out[i*incy];
	 *  - VA[k]*bt is added to the mirrored element tout[j*incy], where bt is
	 *    the element i of rhs shifted by (roff-coff) and tout is out shifted
	 *    by (coff-roff).
	 * Only for the first and for the last entry of a row the mirrored update
	 * is skipped when roff==coff and j==i; the entries in between are always
	 * mirrored (presumably relying on sorted column indices: to be confirmed
	 * at caller level).
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 *
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double *xshift = rhs+(incx)*(roff-coff);
	double *yshift = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xrow = xshift[(incx)*(row)];
		const rsb_nnz_idx_t nzb = bpntr[row], nze = bpntr[row+1];
		rsb_nnz_idx_t nz = nzb;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(nzb==nze)
			continue;	/* empty row */

		/* first entry of the row: no mirroring if it lies on the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[(col)*(incx)];
		if(roff!=coff || (col!=row))
			yshift[(col)*(incy)] += VA[nz]*xrow;
		++nz;

		/* inner entries, four at a time: loads first, then updates in order */
		while(nz+3<nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz+0];
			const double m0 = v0*xrow;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[(c1)*(incx)];
			const double v1 = VA[nz+1];
			const double m1 = v1*xrow;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[(c2)*(incx)];
			const double v2 = VA[nz+2];
			const double m2 = v2*xrow;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[(c3)*(incx)];
			const double v3 = VA[nz+3];
			const double m3 = v3*xrow;

			rowsum += v0*x0;
			yshift[(c0)*(incy)] += m0;
			rowsum += v1*x1;
			yshift[(c1)*(incy)] += m1;
			rowsum += v2*x2;
			yshift[(c2)*(incy)] += m2;
			rowsum += v3*x3;
			yshift[(c3)*(incy)] += m3;
			nz += 4;
		}

		/* remaining inner entries, one at a time */
		while(nz<nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz];
			const double m0 = v0*xrow;

			rowsum += v0*x0;
			yshift[(c0)*(incy)] += m0;
			++nz;
		}

		/* last entry of the row (if any is left): diagonal checked again */
		if(nz<nze)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[(col)*(incx)];
			if(roff!=coff || (col!=row))
				yshift[(col)*(incy)] += VA[nz]*xrow;
			++nz;
		}
		out[(row)*(incy)] += rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc), the entries VA[k], k in [bpntr[i],bpntr[i+1]),
	 * with column j=bindx[k], are used twice:
	 *  - VA[k]*rhs[j*incx] is summed up and finally added to out[i*incy];
	 *  - VA[k]*bt is added to the mirrored element tout[j*incy], where bt is
	 *    the element i of rhs shifted by (roff-coff) and tout is out shifted
	 *    by (coff-roff).
	 * Only for the first and for the last entry of a row the mirrored update
	 * is skipped when roff==coff and j==i; the entries in between are always
	 * mirrored (presumably relying on sorted column indices: to be confirmed
	 * at caller level).
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 *
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double *xshift = rhs+(incx)*(roff-coff);
	double *yshift = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double xrow = xshift[(incx)*(row)];
		const rsb_nnz_idx_t nzb = bpntr[row], nze = bpntr[row+1];
		rsb_nnz_idx_t nz = nzb;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(nzb==nze)
			continue;	/* empty row */

		/* first entry of the row: no mirroring if it lies on the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[(col)*(incx)];
		if(roff!=coff || (col!=row))
			yshift[(col)*(incy)] += VA[nz]*xrow;
		++nz;

		/* inner entries, four at a time: loads first, then updates in order */
		while(nz+3<nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz+0];
			const double m0 = v0*xrow;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[(c1)*(incx)];
			const double v1 = VA[nz+1];
			const double m1 = v1*xrow;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[(c2)*(incx)];
			const double v2 = VA[nz+2];
			const double m2 = v2*xrow;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[(c3)*(incx)];
			const double v3 = VA[nz+3];
			const double m3 = v3*xrow;

			rowsum += v0*x0;
			yshift[(c0)*(incy)] += m0;
			rowsum += v1*x1;
			yshift[(c1)*(incy)] += m1;
			rowsum += v2*x2;
			yshift[(c2)*(incy)] += m2;
			rowsum += v3*x3;
			yshift[(c3)*(incy)] += m3;
			nz += 4;
		}

		/* remaining inner entries, one at a time */
		while(nz<nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz];
			const double m0 = v0*xrow;

			rowsum += v0*x0;
			yshift[(c0)*(incy)] += m0;
			++nz;
		}

		/* last entry of the row (if any is left): diagonal checked again */
		if(nz<nze)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[(col)*(incx)];
			if(roff!=coff || (col!=row))
				yshift[(col)*(incy)] += VA[nz]*xrow;
			++nz;
		}
		out[(row)*(incy)] += rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For the real type double and A == A^H, the transposed product is the
	 * same as the non transposed one: every argument is handed over unchanged
	 * to the tN variant of this kernel, and the error value of that call is
	 * passed back to the caller.
	 *
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For the real type double and A == A^H, the transposed product is the
	 * same as the non transposed one: every argument is handed over unchanged
	 * to the tN variant of this kernel, and the error value of that call is
	 * passed back to the caller.
	 *
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_C__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * As A == A^H, the conjugate transposed product is the same as the non
	 * transposed one: every argument is handed over unchanged to the tN
	 * variant of this kernel, and the error value of that call is passed
	 * back to the caller.
	 *
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_double_H__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * As A == A^H, the conjugate transposed product is the same as the non
	 * transposed one: every argument is handed over unchanged to the tN
	 * variant of this kernel, and the error value of that call is passed
	 * back to the caller.
	 *
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The scalar alpha is read from *alphap. No scaling of y by a beta takes
	 * place in this routine: out is only accumulated into.
	 * For each row i in [br,bc), the products VA[k]*rhs[bindx[k]*incx], with
	 * k in [bpntr[i],bpntr[i+1]), are summed up in order of increasing k, and
	 * alpha times that sum is added to out[i*incy].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double alpha = *alphap;
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t nze = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		double rowsum = ((double)(0));

		/* four entries at a time: loads first, then accumulation in order */
		while(nz+3<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[(c1)*(incx)];
			const double v1 = VA[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[(c2)*(incx)];
			const double v2 = VA[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[(c3)*(incx)];
			const double v3 = VA[nz+3];

			rowsum += v0*x0;
			rowsum += v1*x1;
			rowsum += v2*x2;
			rowsum += v3*x3;
			nz += 4;
		}

		/* remaining entries, one at a time */
		while(nz<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz];

			rowsum += v0*x0;
			++nz;
		}

		out[(row)*(incy)] += (alpha)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The scalar alpha is read from *alphap. No scaling of y by a beta takes
	 * place in this routine: out is only accumulated into.
	 * For each row i in [br,bc), the products VA[k]*rhs[bindx[k]*incx], with
	 * k in [bpntr[i],bpntr[i+1]), are summed up in order of increasing k, and
	 * alpha times that sum is added to out[i*incy].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double alpha = *alphap;
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t nze = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		double rowsum = ((double)(0));

		/* four entries at a time: loads first, then accumulation in order */
		while(nz+3<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[(c1)*(incx)];
			const double v1 = VA[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[(c2)*(incx)];
			const double v2 = VA[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[(c3)*(incx)];
			const double v3 = VA[nz+3];

			rowsum += v0*x0;
			rowsum += v1*x1;
			rowsum += v2*x2;
			rowsum += v3*x3;
			nz += 4;
		}

		/* remaining entries, one at a time */
		while(nz<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz];

			rowsum += v0*x0;
			++nz;
		}

		out[(row)*(incy)] += (alpha)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The scalar alpha is read from *alphap. No scaling of y by a beta takes
	 * place in this routine: out is only accumulated into.
	 * For each row i in [br,bc), bt is alpha times the element i of rhs
	 * shifted by (roff-coff); for every k in [bpntr[i],bpntr[i+1]) the value
	 * VA[k]*bt is added to the element bindx[k] of out shifted by (coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double alpha = *alphap;
	const double *xshift = rhs+(incx)*(roff-coff);
	double *yshift = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double ax = (alpha)*xshift[(incx)*(row)];
		const rsb_nnz_idx_t nze = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four entries at a time: products first, then updates in order */
		while(nz+3<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double v0 = VA[nz+0];
			const double m0 = v0*ax;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double v1 = VA[nz+1];
			const double m1 = v1*ax;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double v2 = VA[nz+2];
			const double m2 = v2*ax;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double v3 = VA[nz+3];
			const double m3 = v3*ax;

			yshift[(c0)*(incy)] += m0;
			yshift[(c1)*(incy)] += m1;
			yshift[(c2)*(incy)] += m2;
			yshift[(c3)*(incy)] += m3;
			nz += 4;
		}

		/* remaining entries, one at a time */
		while(nz<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*ax;

			yshift[(c0)*(incy)] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tT_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The scalar alpha is read from *alphap. No scaling of y by a beta takes
	 * place in this routine: out is only accumulated into.
	 * For each row i in [br,bc), bt is alpha times the element i of rhs
	 * shifted by (roff-coff); for every k in [bpntr[i],bpntr[i+1]) the value
	 * VA[k]*bt is added to the element bindx[k] of out shifted by (coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double alpha = *alphap;
	const double *xshift = rhs+(incx)*(roff-coff);
	double *yshift = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double ax = (alpha)*xshift[(incx)*(row)];
		const rsb_nnz_idx_t nze = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four entries at a time: products first, then updates in order */
		while(nz+3<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double v0 = VA[nz+0];
			const double m0 = v0*ax;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double v1 = VA[nz+1];
			const double m1 = v1*ax;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double v2 = VA[nz+2];
			const double m2 = v2*ax;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double v3 = VA[nz+3];
			const double m3 = v3*ax;

			yshift[(c0)*(incy)] += m0;
			yshift[(c1)*(incy)] += m1;
			yshift[(c2)*(incy)] += m2;
			yshift[(c3)*(incy)] += m3;
			nz += 4;
		}

		/* remaining entries, one at a time */
		while(nz<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*ax;

			yshift[(c0)*(incy)] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The scalar alpha is read from *alphap. No scaling of y by a beta takes
	 * place in this routine: out is only accumulated into.
	 * The values in VA are used as they are (type double: no conjugation).
	 * For each row i in [br,bc), bt is alpha times the element i of rhs
	 * shifted by (roff-coff); for every k in [bpntr[i],bpntr[i+1]) the value
	 * VA[k]*bt is added to the element bindx[k] of out shifted by (coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double alpha = *alphap;
	const double *xshift = rhs+(incx)*(roff-coff);
	double *yshift = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double ax = (alpha)*xshift[(incx)*(row)];
		const rsb_nnz_idx_t nze = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four entries at a time: products first, then updates in order */
		while(nz+3<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double v0 = VA[nz+0];
			const double m0 = v0*ax;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double v1 = VA[nz+1];
			const double m1 = v1*ax;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double v2 = VA[nz+2];
			const double m2 = v2*ax;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double v3 = VA[nz+3];
			const double m3 = v3*ax;

			yshift[(c0)*(incy)] += m0;
			yshift[(c1)*(incy)] += m1;
			yshift[(c2)*(incy)] += m2;
			yshift[(c3)*(incy)] += m3;
			nz += 4;
		}

		/* remaining entries, one at a time */
		while(nz<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*ax;

			yshift[(c0)*(incy)] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tC_r1_c1_uu_sU_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The scalar alpha is read from *alphap. No scaling of y by a beta takes
	 * place in this routine: out is only accumulated into.
	 * The values in VA are used as they are (type double: no conjugation).
	 * For each row i in [br,bc), bt is alpha times the element i of rhs
	 * shifted by (roff-coff); for every k in [bpntr[i],bpntr[i+1]) the value
	 * VA[k]*bt is added to the element bindx[k] of out shifted by (coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double alpha = *alphap;
	const double *xshift = rhs+(incx)*(roff-coff);
	double *yshift = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double ax = (alpha)*xshift[(incx)*(row)];
		const rsb_nnz_idx_t nze = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four entries at a time: products first, then updates in order */
		while(nz+3<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double v0 = VA[nz+0];
			const double m0 = v0*ax;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double v1 = VA[nz+1];
			const double m1 = v1*ax;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double v2 = VA[nz+2];
			const double m2 = v2*ax;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double v3 = VA[nz+3];
			const double m3 = v3*ax;

			yshift[(c0)*(incy)] += m0;
			yshift[(c1)*(incy)] += m1;
			yshift[(c2)*(incy)] += m2;
			yshift[(c3)*(incy)] += m3;
			nz += 4;
		}

		/* remaining entries, one at a time */
		while(nz<nze)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*ax;

			yshift[(c0)*(incy)] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The scalar alpha is read from *alphap. No scaling of y by a beta takes
	 * place in this routine: out is only accumulated into.
	 * For each row i in [br,bc), the entries VA[k], k in [bpntr[i],bpntr[i+1]),
	 * with column j=bindx[k], are used twice:
	 *  - VA[k]*rhs[j*incx] is summed up, and alpha times the sum is finally
	 *    added to out[i*incy];
	 *  - VA[k]*bt is added to the mirrored element tout[j*incy], where bt is
	 *    alpha times the element i of rhs shifted by (roff-coff) and tout is
	 *    out shifted by (coff-roff).
	 * Only for the first and for the last entry of a row the mirrored update
	 * is skipped when roff==coff and j==i; the entries in between are always
	 * mirrored (presumably relying on sorted column indices: to be confirmed
	 * at caller level).
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double alpha = *alphap;
	const double *xshift = rhs+(incx)*(roff-coff);
	double *yshift = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double ax = (alpha)*xshift[(incx)*(row)];
		const rsb_nnz_idx_t nzb = bpntr[row], nze = bpntr[row+1];
		rsb_nnz_idx_t nz = nzb;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(nzb==nze)
			continue;	/* empty row */

		/* first entry of the row: no mirroring if it lies on the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[(col)*(incx)];
		if(roff!=coff || (col!=row))
			yshift[(col)*(incy)] += VA[nz]*ax;
		++nz;

		/* inner entries, four at a time: loads first, then updates in order */
		while(nz+3<nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz+0];
			const double m0 = v0*ax;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[(c1)*(incx)];
			const double v1 = VA[nz+1];
			const double m1 = v1*ax;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[(c2)*(incx)];
			const double v2 = VA[nz+2];
			const double m2 = v2*ax;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[(c3)*(incx)];
			const double v3 = VA[nz+3];
			const double m3 = v3*ax;

			rowsum += v0*x0;
			yshift[(c0)*(incy)] += m0;
			rowsum += v1*x1;
			yshift[(c1)*(incy)] += m1;
			rowsum += v2*x2;
			yshift[(c2)*(incy)] += m2;
			rowsum += v3*x3;
			yshift[(c3)*(incy)] += m3;
			nz += 4;
		}

		/* remaining inner entries, one at a time */
		while(nz<nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz];
			const double m0 = v0*ax;

			rowsum += v0*x0;
			yshift[(c0)*(incy)] += m0;
			++nz;
		}

		/* last entry of the row (if any is left): diagonal checked again */
		if(nz<nze)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[(col)*(incx)];
			if(roff!=coff || (col!=row))
				yshift[(col)*(incy)] += VA[nz]*ax;
			++nz;
		}
		out[(row)*(incy)] += (alpha)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The scalar alpha is read from *alphap. No scaling of y by a beta takes
	 * place in this routine: out is only accumulated into.
	 * For each row i in [br,bc), the entries VA[k], k in [bpntr[i],bpntr[i+1]),
	 * with column j=bindx[k], are used twice:
	 *  - VA[k]*rhs[j*incx] is summed up, and alpha times the sum is finally
	 *    added to out[i*incy];
	 *  - VA[k]*bt is added to the mirrored element tout[j*incy], where bt is
	 *    alpha times the element i of rhs shifted by (roff-coff) and tout is
	 *    out shifted by (coff-roff).
	 * Only for the first and for the last entry of a row the mirrored update
	 * is skipped when roff==coff and j==i; the entries in between are always
	 * mirrored (presumably relying on sorted column indices: to be confirmed
	 * at caller level).
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const double alpha = *alphap;
	const double *xshift = rhs+(incx)*(roff-coff);
	double *yshift = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const double ax = (alpha)*xshift[(incx)*(row)];
		const rsb_nnz_idx_t nzb = bpntr[row], nze = bpntr[row+1];
		rsb_nnz_idx_t nz = nzb;
		rsb_coo_idx_t col;
		double rowsum = ((double)(0));

		if(nzb==nze)
			continue;	/* empty row */

		/* first entry of the row: no mirroring if it lies on the diagonal */
		col = bindx[nz];
		rowsum += VA[nz]*rhs[(col)*(incx)];
		if(roff!=coff || (col!=row))
			yshift[(col)*(incy)] += VA[nz]*ax;
		++nz;

		/* inner entries, four at a time: loads first, then updates in order */
		while(nz+3<nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz+0];
			const double m0 = v0*ax;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[(c1)*(incx)];
			const double v1 = VA[nz+1];
			const double m1 = v1*ax;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[(c2)*(incx)];
			const double v2 = VA[nz+2];
			const double m2 = v2*ax;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[(c3)*(incx)];
			const double v3 = VA[nz+3];
			const double m3 = v3*ax;

			rowsum += v0*x0;
			yshift[(c0)*(incy)] += m0;
			rowsum += v1*x1;
			yshift[(c1)*(incy)] += m1;
			rowsum += v2*x2;
			yshift[(c2)*(incy)] += m2;
			rowsum += v3*x3;
			yshift[(c3)*(incy)] += m3;
			nz += 4;
		}

		/* remaining inner entries, one at a time */
		while(nz<nze-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[(c0)*(incx)];
			const double v0 = VA[nz];
			const double m0 = v0*ax;

			rowsum += v0*x0;
			yshift[(c0)*(incy)] += m0;
			++nz;
		}

		/* last entry of the row (if any is left): diagonal checked again */
		if(nz<nze)
		{
			col = bindx[nz];
			rowsum += VA[nz]*rhs[(col)*(incx)];
			if(roff!=coff || (col!=row))
				yshift[(col)*(incy)] += VA[nz]*ax;
			++nz;
		}
		out[(row)*(incy)] += (alpha)*rowsum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes the \f$ {A^T} \cdot x \f$ flavour of the strided, alpha-scaled product, where A == A^T.
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * As A == A^T, the transposed product is the same as the non transposed one:
	 * every argument (alphap included) is handed over unchanged to the tN
	 * variant of this kernel, and the error value of that call is passed back
	 * to the caller.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tT_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes the \f$ {A^T} \cdot x \f$ flavour of the strided, alpha-scaled product, where A == A^T.
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * As A == A^T, the transposed product is the same as the non transposed one:
	 * every argument (alphap included) is handed over unchanged to the tN
	 * variant of this kernel, and the error value of that call is passed back
	 * to the caller.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes the \f$ {A^H} \cdot x \f$ flavour of the strided, alpha-scaled product, where A == A^T.
	 * with incx and incy as x and y vector strides
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For the real type double and A == A^T, the conjugate transposed product
	 * is the same as the non transposed one: every argument (alphap included)
	 * is handed over unchanged to the tN variant of this kernel, and the error
	 * value of that call is passed back to the caller.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tC_r1_c1_uu_sS_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot {A^H} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides
	 * NOTE(review): no beta argument is passed to this function, so any
	 * \f$\beta\f$ scaling of y presumably happens at caller level -- confirm.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * This entry point does no arithmetic of its own: every argument is
	 * forwarded unchanged to the non-transposed (tN) symmetric kernel with
	 * the same value type and index width, and the value that kernel
	 * returns is handed back as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Values are real (double), so A^H == A^T, and A^T == A by symmetry: reuse the non-transposed kernel. */
	return rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^H\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero at (row,col) is used twice: once in the dot product
	 * added to out at row, and once mirrored into out at col (through the
	 * offset-shifted views trhs/tout). The mirrored update is suppressed only
	 * for the first and last nonzero of a row, and only when roff==coff and
	 * col==row; interior nonzeroes are always mirrored (presumably a row can
	 * hold its diagonal entry only at one of its ends -- confirm with caller).
	 * Values being real, no conjugation is involved.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const double alpha = *alphap;
	/* x and y shifted by the row/column offset difference, used by the mirrored updates */
	const double *trhs = rhs+(incx)*(roff-coff);
	double *tout = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		/* alpha * x(row): common factor of every mirrored update of this row */
		const double bt = (alpha)*trhs[(incx)*(row)];
		const rsb_nnz_idx_t row_beg = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_beg;
		rsb_coo_idx_t col;
		double acc = ((double)(0));

		if(nz == row_end)
			continue;	/* empty row: nothing to add */

		/* first nonzero of the row: the only leading candidate for the diagonal */
		col = bindx[nz];
		acc += VA[nz]*rhs[col*(incx)];
		if(!(roff==coff && col==row))
			tout[col*(incy)] += VA[nz]*bt;
		++nz;

		/* interior nonzeroes, four per pass; the last one is left to the tail below */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz+0];
			const double m0 = v0*bt;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[c1*(incx)];
			const double v1 = VA[nz+1];
			const double m1 = v1*bt;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[c2*(incx)];
			const double v2 = VA[nz+2];
			const double m2 = v2*bt;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[c3*(incx)];
			const double v3 = VA[nz+3];
			const double m3 = v3*bt;

			acc += v0*x0;
			tout[c0*(incy)] += m0;
			acc += v1*x1;
			tout[c1*(incy)] += m1;
			acc += v2*x2;
			tout[c2*(incy)] += m2;
			acc += v3*x3;
			tout[c3*(incy)] += m3;
			nz += 4;
		}

		/* interior nonzeroes that did not fill a group of four */
		for(; nz < row_end-1; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz];
			const double m0 = v0*bt;

			acc += v0*x0;
			tout[c0*(incy)] += m0;
		}

		/* last nonzero of the row (absent when the row has a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col*(incx)];
			if(!(roff==coff && col==row))
				tout[col*(incy)] += VA[nz]*bt;
		}

		out[row*(incy)] += (alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^H\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Each stored nonzero at (row,col) is used twice: once in the dot product
	 * added to out at row, and once mirrored into out at col (through the
	 * offset-shifted views trhs/tout). The mirrored update is suppressed only
	 * for the first and last nonzero of a row, and only when roff==coff and
	 * col==row; interior nonzeroes are always mirrored (presumably a row can
	 * hold its diagonal entry only at one of its ends -- confirm with caller).
	 * Values being real, no conjugation is involved.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const double alpha = *alphap;
	/* x and y shifted by the row/column offset difference, used by the mirrored updates */
	const double *trhs = rhs+(incx)*(roff-coff);
	double *tout = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		/* alpha * x(row): common factor of every mirrored update of this row */
		const double bt = (alpha)*trhs[(incx)*(row)];
		const rsb_nnz_idx_t row_beg = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_beg;
		rsb_coo_idx_t col;	/* half-width indices are widened on load */
		double acc = ((double)(0));

		if(nz == row_end)
			continue;	/* empty row: nothing to add */

		/* first nonzero of the row: the only leading candidate for the diagonal */
		col = bindx[nz];
		acc += VA[nz]*rhs[col*(incx)];
		if(!(roff==coff && col==row))
			tout[col*(incy)] += VA[nz]*bt;
		++nz;

		/* interior nonzeroes, four per pass; the last one is left to the tail below */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz+0];
			const double m0 = v0*bt;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[c1*(incx)];
			const double v1 = VA[nz+1];
			const double m1 = v1*bt;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[c2*(incx)];
			const double v2 = VA[nz+2];
			const double m2 = v2*bt;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[c3*(incx)];
			const double v3 = VA[nz+3];
			const double m3 = v3*bt;

			acc += v0*x0;
			tout[c0*(incy)] += m0;
			acc += v1*x1;
			tout[c1*(incy)] += m1;
			acc += v2*x2;
			tout[c2*(incy)] += m2;
			acc += v3*x3;
			tout[c3*(incy)] += m3;
			nz += 4;
		}

		/* interior nonzeroes that did not fill a group of four */
		for(; nz < row_end-1; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz];
			const double m0 = v0*bt;

			acc += v0*x0;
			tout[c0*(incy)] += m0;
		}

		/* last nonzero of the row (absent when the row has a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col*(incx)];
			if(!(roff==coff && col==row))
				tout[col*(incy)] += VA[nz]*bt;
		}

		out[row*(incy)] += (alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot {A^T} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides
	 * NOTE(review): no beta argument is passed to this function, so any
	 * \f$\beta\f$ scaling of y presumably happens at caller level -- confirm.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * This entry point does no arithmetic of its own: every argument is
	 * forwarded unchanged to the non-transposed (tN) hermitian kernel with
	 * the same value type and index width, and the value that kernel
	 * returns is handed back as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Values are real (double), so A == A^H also means A == A^T: reuse the non-transposed kernel. */
	return rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tT_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot {A^T} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides
	 * NOTE(review): no beta argument is passed to this function, so any
	 * \f$\beta\f$ scaling of y presumably happens at caller level -- confirm.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * This entry point does no arithmetic of its own: every argument is
	 * forwarded unchanged to the non-transposed (tN) hermitian kernel with
	 * the same value type and index width, and the value that kernel
	 * returns is handed back as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Values are real (double), so A == A^H also means A == A^T: reuse the non-transposed kernel. */
	return rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides
	 * NOTE(review): no beta argument is passed to this function, so any
	 * \f$\beta\f$ scaling of y presumably happens at caller level -- confirm.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * This entry point does no arithmetic of its own: every argument is
	 * forwarded unchanged to the non-transposed (tN) hermitian kernel with
	 * the same value type and index width, and the value that kernel
	 * returns is handed back as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* A == A^H, so the conjugate-transposed product is the same as the non-transposed one. */
	return rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tC_r1_c1_uu_sH_dE_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides
	 * NOTE(review): no beta argument is passed to this function, so any
	 * \f$\beta\f$ scaling of y presumably happens at caller level -- confirm.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * This entry point does no arithmetic of its own: every argument is
	 * forwarded unchanged to the non-transposed (tN) hermitian kernel with
	 * the same value type and index width, and the value that kernel
	 * returns is handed back as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* A == A^H, so the conjugate-transposed product is the same as the non-transposed one. */
	return rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * For each row in br..bc-1 the dot product of the stored row with the
	 * strided x is formed, scaled by *alphap and added to out at that row.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double alpha = *alphap;
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		double acc = ((double)(0));

		/* four nonzeroes per pass */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[c1*(incx)];
			const double v1 = VA[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[c2*(incx)];
			const double v2 = VA[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[c3*(incx)];
			const double v3 = VA[nz+3];

			acc += v0*x0;
			acc += v1*x1;
			acc += v2*x2;
			acc += v3*x3;
			nz += 4;
		}

		/* remaining (at most three) nonzeroes of the row */
		for(; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz];

			acc += v0*x0;
		}

		out[row*(incy)] += (alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * For each row in br..bc-1 the dot product of the stored row with the
	 * strided x is formed, scaled by *alphap and added to out at that row.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double alpha = *alphap;
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		double acc = ((double)(0));

		/* four nonzeroes per pass; half-width indices are widened on load */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[c1*(incx)];
			const double v1 = VA[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[c2*(incx)];
			const double v2 = VA[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[c3*(incx)];
			const double v3 = VA[nz+3];

			acc += v0*x0;
			acc += v1*x1;
			acc += v2*x2;
			acc += v3*x3;
			nz += 4;
		}

		/* remaining (at most three) nonzeroes of the row */
		for(; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz];

			acc += v0*x0;
		}

		out[row*(incy)] += (alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * The stored rows are walked as usual, but each nonzero at (row,col)
	 * scatters value * alpha * x(row) into y(col); x and y are addressed
	 * through views shifted by (roff-coff) and (coff-roff) elements.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double alpha = *alphap;
	const double *trhs = rhs+(incx)*(roff-coff);
	double *tout = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		/* alpha * x(row): the factor every nonzero of this row is scaled by */
		const double bt = (alpha)*trhs[(incx)*(row)];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: all products first, then the four scatters */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double v0 = VA[nz+0];
			const double m0 = v0*bt;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double v1 = VA[nz+1];
			const double m1 = v1*bt;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double v2 = VA[nz+2];
			const double m2 = v2*bt;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double v3 = VA[nz+3];
			const double m3 = v3*bt;

			tout[c0*(incy)] += m0;
			tout[c1*(incy)] += m1;
			tout[c2*(incy)] += m2;
			tout[c3*(incy)] += m3;
			nz += 4;
		}

		/* remaining (at most three) nonzeroes of the row */
		for(; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*bt;

			tout[c0*(incy)] += m0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tT_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * The stored rows are walked as usual, but each nonzero at (row,col)
	 * scatters value * alpha * x(row) into y(col); x and y are addressed
	 * through views shifted by (roff-coff) and (coff-roff) elements.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double alpha = *alphap;
	const double *trhs = rhs+(incx)*(roff-coff);
	double *tout = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		/* alpha * x(row): the factor every nonzero of this row is scaled by */
		const double bt = (alpha)*trhs[(incx)*(row)];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: all products first, then the four scatters */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];	/* half-width index widened on load */
			const double v0 = VA[nz+0];
			const double m0 = v0*bt;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double v1 = VA[nz+1];
			const double m1 = v1*bt;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double v2 = VA[nz+2];
			const double m2 = v2*bt;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double v3 = VA[nz+3];
			const double m3 = v3*bt;

			tout[c0*(incy)] += m0;
			tout[c1*(incy)] += m1;
			tout[c2*(incy)] += m2;
			tout[c3*(incy)] += m3;
			nz += 4;
		}

		/* remaining (at most three) nonzeroes of the row */
		for(; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*bt;

			tout[c0*(incy)] += m0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Values are real (double), so no conjugation is applied and the work
	 * done is that of a plain transposed product: each nonzero at (row,col)
	 * scatters value * alpha * x(row) into y(col); x and y are addressed
	 * through views shifted by (roff-coff) and (coff-roff) elements.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double alpha = *alphap;
	const double *trhs = rhs+(incx)*(roff-coff);
	double *tout = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		/* alpha * x(row): the factor every nonzero of this row is scaled by */
		const double bt = (alpha)*trhs[(incx)*(row)];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: all products first, then the four scatters */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double v0 = VA[nz+0];
			const double m0 = v0*bt;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double v1 = VA[nz+1];
			const double m1 = v1*bt;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double v2 = VA[nz+2];
			const double m2 = v2*bt;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double v3 = VA[nz+3];
			const double m3 = v3*bt;

			tout[c0*(incy)] += m0;
			tout[c1*(incy)] += m1;
			tout[c2*(incy)] += m2;
			tout[c3*(incy)] += m3;
			nz += 4;
		}

		/* remaining (at most three) nonzeroes of the row */
		for(; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*bt;

			tout[c0*(incy)] += m0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tC_r1_c1_uu_sU_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Values are real (double), so no conjugation is applied and the work
	 * done is that of a plain transposed product: each nonzero at (row,col)
	 * scatters value * alpha * x(row) into y(col); x and y are addressed
	 * through views shifted by (roff-coff) and (coff-roff) elements.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double alpha = *alphap;
	const double *trhs = rhs+(incx)*(roff-coff);
	double *tout = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		/* alpha * x(row): the factor every nonzero of this row is scaled by */
		const double bt = (alpha)*trhs[(incx)*(row)];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: all products first, then the four scatters */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];	/* half-width index widened on load */
			const double v0 = VA[nz+0];
			const double m0 = v0*bt;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double v1 = VA[nz+1];
			const double m1 = v1*bt;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double v2 = VA[nz+2];
			const double m2 = v2*bt;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double v3 = VA[nz+3];
			const double m3 = v3*bt;

			tout[c0*(incy)] += m0;
			tout[c1*(incy)] += m1;
			tout[c2*(incy)] += m2;
			tout[c3*(incy)] += m3;
			nz += 4;
		}

		/* remaining (at most three) nonzeroes of the row */
		for(; nz < row_end; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double v0 = VA[nz];
			const double m0 = v0*bt;

			tout[c0*(incy)] += m0;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^T\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero at (row,col) is used twice: once in the dot product
	 * added to out at row, and once mirrored into out at col (through the
	 * offset-shifted views trhs/tout). The mirrored update is suppressed only
	 * for the first and last nonzero of a row, and only when roff==coff and
	 * col==row; interior nonzeroes are always mirrored (presumably a row can
	 * hold its diagonal entry only at one of its ends -- confirm with caller).
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double alpha = *alphap;
	/* x and y shifted by the row/column offset difference, used by the mirrored updates */
	const double *trhs = rhs+(incx)*(roff-coff);
	double *tout = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		/* alpha * x(row): common factor of every mirrored update of this row */
		const double bt = (alpha)*trhs[(incx)*(row)];
		const rsb_nnz_idx_t row_beg = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_beg;
		rsb_coo_idx_t col;
		double acc = ((double)(0));

		if(nz == row_end)
			continue;	/* empty row: nothing to add */

		/* first nonzero of the row: the only leading candidate for the diagonal */
		col = bindx[nz];
		acc += VA[nz]*rhs[col*(incx)];
		if(!(roff==coff && col==row))
			tout[col*(incy)] += VA[nz]*bt;
		++nz;

		/* interior nonzeroes, four per pass; the last one is left to the tail below */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz+0];
			const double m0 = v0*bt;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[c1*(incx)];
			const double v1 = VA[nz+1];
			const double m1 = v1*bt;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[c2*(incx)];
			const double v2 = VA[nz+2];
			const double m2 = v2*bt;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[c3*(incx)];
			const double v3 = VA[nz+3];
			const double m3 = v3*bt;

			acc += v0*x0;
			tout[c0*(incy)] += m0;
			acc += v1*x1;
			tout[c1*(incy)] += m1;
			acc += v2*x2;
			tout[c2*(incy)] += m2;
			acc += v3*x3;
			tout[c3*(incy)] += m3;
			nz += 4;
		}

		/* interior nonzeroes that did not fill a group of four */
		for(; nz < row_end-1; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz];
			const double m0 = v0*bt;

			acc += v0*x0;
			tout[c0*(incy)] += m0;
		}

		/* last nonzero of the row (absent when the row has a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col*(incx)];
			if(!(roff==coff && col==row))
				tout[col*(incy)] += VA[nz]*bt;
		}

		out[row*(incy)] += (alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A = A^T\f$,
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Each stored nonzero at (row,col) is used twice: once in the dot product
	 * added to out at row, and once mirrored into out at col (through the
	 * offset-shifted views trhs/tout). The mirrored update is suppressed only
	 * for the first and last nonzero of a row, and only when roff==coff and
	 * col==row; interior nonzeroes are always mirrored (presumably a row can
	 * hold its diagonal entry only at one of its ends -- confirm with caller).
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const double alpha = *alphap;
	/* x and y shifted by the row/column offset difference, used by the mirrored updates */
	const double *trhs = rhs+(incx)*(roff-coff);
	double *tout = out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* rows br..bc-1 (bounded box) */
	{
		/* alpha * x(row): common factor of every mirrored update of this row */
		const double bt = (alpha)*trhs[(incx)*(row)];
		const rsb_nnz_idx_t row_beg = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_beg;
		rsb_coo_idx_t col;	/* half-width indices are widened on load */
		double acc = ((double)(0));

		if(nz == row_end)
			continue;	/* empty row: nothing to add */

		/* first nonzero of the row: the only leading candidate for the diagonal */
		col = bindx[nz];
		acc += VA[nz]*rhs[col*(incx)];
		if(!(roff==coff && col==row))
			tout[col*(incy)] += VA[nz]*bt;
		++nz;

		/* interior nonzeroes, four per pass; the last one is left to the tail below */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz+0];
			const double m0 = v0*bt;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const double x1 = rhs[c1*(incx)];
			const double v1 = VA[nz+1];
			const double m1 = v1*bt;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const double x2 = rhs[c2*(incx)];
			const double v2 = VA[nz+2];
			const double m2 = v2*bt;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x3 = rhs[c3*(incx)];
			const double v3 = VA[nz+3];
			const double m3 = v3*bt;

			acc += v0*x0;
			tout[c0*(incy)] += m0;
			acc += v1*x1;
			tout[c1*(incy)] += m1;
			acc += v2*x2;
			tout[c2*(incy)] += m2;
			acc += v3*x3;
			tout[c3*(incy)] += m3;
			nz += 4;
		}

		/* interior nonzeroes that did not fill a group of four */
		for(; nz < row_end-1; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*(incx)];
			const double v0 = VA[nz];
			const double m0 = v0*bt;

			acc += v0*x0;
			tout[c0*(incy)] += m0;
		}

		/* last nonzero of the row (absent when the row has a single entry) */
		if(nz < row_end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col*(incx)];
			if(!(roff==coff && col==row))
				tout[col*(incy)] += VA[nz]*bt;
		}

		out[row*(incy)] += (alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot {A^T} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides
	 * NOTE(review): no beta argument is passed to this function, so any
	 * \f$\beta\f$ scaling of y presumably happens at caller level -- confirm.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * This entry point does no arithmetic of its own: every argument is
	 * forwarded unchanged to the non-transposed (tN) symmetric kernel with
	 * the same value type and index width, and the value that kernel
	 * returns is handed back as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* A == A^T, so the transposed product is the same as the non-transposed one. */
	return rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tT_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot {A^T} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides
	 * NOTE(review): no beta argument is passed to this function, so any
	 * \f$\beta\f$ scaling of y presumably happens at caller level -- confirm.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * This entry point does no arithmetic of its own: every argument is
	 * forwarded unchanged to the non-transposed (tN) symmetric kernel with
	 * the same value type and index width, and the value that kernel
	 * returns is handed back as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* A == A^T, so the transposed product is the same as the non-transposed one. */
	return rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the conjugate transposed operand (\f$A^H\f$) of a symmetric matrix (\f$A == A^T\f$).
	 * No computation takes place here: every argument (alphap, incx and incy included)
	 * is handed over, unchanged, to rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sS_dI_uG.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not taken care of here; that is left to the caller level. */
	/* Symmetric matrix: the transposed case is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tC_r1_c1_uu_sS_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the conjugate transposed operand (\f$A^H\f$) of a symmetric matrix (\f$A == A^T\f$).
	 * No computation takes place here: every argument (alphap, incx and incy included)
	 * is handed over, unchanged, to rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sS_dI_uG.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not taken care of here; that is left to the caller level. */
	/* Symmetric matrix: the transposed case is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f$\alpha \cdot {A} \cdot x\f$ to y, where A == A^H; y is only accumulated into, never rescaled, by this routine.
	 * incx and incy are the x and y vector strides; alphap points to \f$\alpha\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited; row i owns entries bpntr[i] .. bpntr[i+1]-1 of VA and bindx.
	 * Each entry k of row i, with j = bindx[k], is used twice:
	 *  - VA[k] * rhs[j*incx] enters the row sum which, times alpha, is added to out[i*incy];
	 *  - mirrored, alpha * VA[k] * x'[i*incx] is added to y'[j*incy],
	 *    with x' = rhs + incx*(roff-coff) and y' = out + incy*(coff-roff).
	 * The first and the last entry of a row skip the mirrored update when roff == coff and j == i
	 * (presumably a diagonal entry, which must not be applied twice); interior entries never skip it.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not taken care of here; that is left to the caller level. */
	const double alpha = *alphap;
	const double *x_view = rhs+(incx)*(roff-coff);	/* x' of the description above */
	double *y_view = out+(incy)*(coff-roff);	/* y' of the description above */
	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const double x_row = alpha*x_view[incx*row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;
		register double row_sum = ((double)(0));
		rsb_coo_idx_t col;

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* First entry of the row: the only one, with the last, checked against the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz]*rhs[col*incx];
		if(roff!=coff || col!=row)
			y_view[col*incy] += VA[nz]*x_row;

		/* Interior entries (first and last left out), four per pass. */
		for(nz=row_begin+1; nz+3<row_end-1; nz+=4)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x0 = rhs[c0*incx];
			const double x1 = rhs[c1*incx];
			const double x2 = rhs[c2*incx];
			const double x3 = rhs[c3*incx];
			const double v0 = VA[nz+0];
			const double v1 = VA[nz+1];
			const double v2 = VA[nz+2];
			const double v3 = VA[nz+3];
			const double m0 = v0*x_row;
			const double m1 = v1*x_row;
			const double m2 = v2*x_row;
			const double m3 = v3*x_row;

			/* Updates are applied in increasing entry order. */
			row_sum += v0*x0;
			y_view[c0*incy] += m0;
			row_sum += v1*x1;
			y_view[c1*incy] += m1;
			row_sum += v2*x2;
			y_view[c2*incy] += m2;
			row_sum += v3*x3;
			y_view[c3*incy] += m3;
		}

		/* Interior entries left over by the unrolled loop, one per pass. */
		for(; nz<row_end-1; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*incx];
			const double v0 = VA[nz];
			const double m0 = v0*x_row;

			row_sum += v0*x0;
			y_view[c0*incy] += m0;
		}

		/* Last entry of the row (absent when the row holds a single entry). */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col*incx];
			if(roff!=coff || col!=row)
				y_view[col*incy] += VA[nz]*x_row;
		}

		out[row*incy] += alpha*row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f$\alpha \cdot {A} \cdot x\f$ to y, where A == A^H; y is only accumulated into, never rescaled, by this routine.
	 * incx and incy are the x and y vector strides; alphap points to \f$\alpha\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited; row i owns entries bpntr[i] .. bpntr[i+1]-1 of VA and bindx.
	 * Each entry k of row i, with j = bindx[k], is used twice:
	 *  - VA[k] * rhs[j*incx] enters the row sum which, times alpha, is added to out[i*incy];
	 *  - mirrored, alpha * VA[k] * x'[i*incx] is added to y'[j*incy],
	 *    with x' = rhs + incx*(roff-coff) and y' = out + incy*(coff-roff).
	 * The first and the last entry of a row skip the mirrored update when roff == coff and j == i
	 * (presumably a diagonal entry, which must not be applied twice); interior entries never skip it.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not taken care of here; that is left to the caller level. */
	const double alpha = *alphap;
	const double *x_view = rhs+(incx)*(roff-coff);	/* x' of the description above */
	double *y_view = out+(incy)*(coff-roff);	/* y' of the description above */
	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const double x_row = alpha*x_view[incx*row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;
		register double row_sum = ((double)(0));
		rsb_coo_idx_t col;

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* First entry of the row: the only one, with the last, checked against the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz]*rhs[col*incx];
		if(roff!=coff || col!=row)
			y_view[col*incy] += VA[nz]*x_row;

		/* Interior entries (first and last left out), four per pass. */
		for(nz=row_begin+1; nz+3<row_end-1; nz+=4)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const double x0 = rhs[c0*incx];
			const double x1 = rhs[c1*incx];
			const double x2 = rhs[c2*incx];
			const double x3 = rhs[c3*incx];
			const double v0 = VA[nz+0];
			const double v1 = VA[nz+1];
			const double v2 = VA[nz+2];
			const double v3 = VA[nz+3];
			const double m0 = v0*x_row;
			const double m1 = v1*x_row;
			const double m2 = v2*x_row;
			const double m3 = v3*x_row;

			/* Updates are applied in increasing entry order. */
			row_sum += v0*x0;
			y_view[c0*incy] += m0;
			row_sum += v1*x1;
			y_view[c1*incy] += m1;
			row_sum += v2*x2;
			y_view[c2*incy] += m2;
			row_sum += v3*x3;
			y_view[c3*incy] += m3;
		}

		/* Interior entries left over by the unrolled loop, one per pass. */
		for(; nz<row_end-1; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const double x0 = rhs[c0*incx];
			const double v0 = VA[nz];
			const double m0 = v0*x_row;

			row_sum += v0*x0;
			y_view[c0*incy] += m0;
		}

		/* Last entry of the row (absent when the row holds a single entry). */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col*incx];
			if(roff!=coff || col!=row)
				y_view[col*incy] += VA[nz]*x_row;
		}

		out[row*incy] += alpha*row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the transposed operand (\f$A^T\f$) of a matrix with \f$A == A^H\f$.
	 * No computation takes place here: every argument (alphap, incx and incy included)
	 * is handed over, unchanged, to rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sH_dI_uG.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not taken care of here; that is left to the caller level. */
	/* As in the symmetric case, the transposed product is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tT_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the transposed operand (\f$A^T\f$) of a matrix with \f$A == A^H\f$.
	 * No computation takes place here: every argument (alphap, incx and incy included)
	 * is handed over, unchanged, to rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sH_dI_uG.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not taken care of here; that is left to the caller level. */
	/* As in the symmetric case, the transposed product is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_C__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the conjugate transposed operand (\f$A^H\f$) of a matrix with \f$A == A^H\f$.
	 * No computation takes place here: every argument (alphap, incx and incy included)
	 * is handed over, unchanged, to rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sH_dI_uG.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not taken care of here; that is left to the caller level. */
	/* A == A^H: the conjugate transposed product is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_double_H__tC_r1_c1_uu_sH_dI_uG(const double * restrict VA, const double * restrict rhs, double * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const double * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the conjugate transposed operand (\f$A^H\f$) of a matrix with \f$A == A^H\f$.
	 * No computation takes place here: every argument (alphap, incx and incy included)
	 * is handed over, unchanged, to rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sH_dI_uG.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type double, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not taken care of here; that is left to the caller level. */
	/* A == A^H: the conjugate transposed product is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_double_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f${A} \cdot x\f$ to y, where A \neq A^T (no symmetry is exploited).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each row i from br (included) to bc (excluded) the products VA[k] * rhs[bindx[k]],
	 * k = bpntr[i] .. bpntr[i+1]-1, are summed in increasing k order and the sum is added to out[i].
	 * Both vectors are accessed with unit stride.
	 * Only VA, rhs, out, bindx, bpntr, br and bc are referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;
		register float row_sum = ((float)(0));

		/* Bulk of the row, four entries per pass. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x0 = rhs[c0];
			const float x1 = rhs[c1];
			const float x2 = rhs[c2];
			const float x3 = rhs[c3];
			const float v0 = VA[nz+0];
			const float v1 = VA[nz+1];
			const float v2 = VA[nz+2];
			const float v3 = VA[nz+3];

			row_sum += v0*x0;
			row_sum += v1*x1;
			row_sum += v2*x2;
			row_sum += v3*x3;
			nz += 4;
		}

		/* Entries left over by the unrolled loop, one per pass. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];

			row_sum += v0*x0;
			++nz;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f${A} \cdot x\f$ to y, where A \neq A^T (no symmetry is exploited).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each row i from br (included) to bc (excluded) the products VA[k] * rhs[bindx[k]],
	 * k = bpntr[i] .. bpntr[i+1]-1, are summed in increasing k order and the sum is added to out[i].
	 * Both vectors are accessed with unit stride.
	 * Only VA, rhs, out, bindx, bpntr, br and bc are referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;
		register float row_sum = ((float)(0));

		/* Bulk of the row, four entries per pass. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x0 = rhs[c0];
			const float x1 = rhs[c1];
			const float x2 = rhs[c2];
			const float x3 = rhs[c3];
			const float v0 = VA[nz+0];
			const float v1 = VA[nz+1];
			const float v2 = VA[nz+2];
			const float v3 = VA[nz+3];

			row_sum += v0*x0;
			row_sum += v1*x1;
			row_sum += v2*x2;
			row_sum += v3*x3;
			nz += 4;
		}

		/* Entries left over by the unrolled loop, one per pass. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];

			row_sum += v0*x0;
			++nz;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f${A^T} \cdot x\f$ to y, where A \neq A^T (no symmetry is exploited).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each stored row i from br (included) to bc (excluded) and each of its entries
	 * k = bpntr[i] .. bpntr[i+1]-1, VA[k] * x'[i] is added to y'[bindx[k]],
	 * with x' = rhs + (roff-coff) and y' = out + (coff-roff); both vectors have unit stride.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float *x_view = rhs+(roff-coff);	/* x' of the description above */
	float *y_view = out+(coff-roff);	/* y' of the description above */
	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const float x_row = x_view[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;

		/* Bulk of the row, four entries per pass. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float v0 = VA[nz+0];
			const float v1 = VA[nz+1];
			const float v2 = VA[nz+2];
			const float v3 = VA[nz+3];
			const float p0 = v0*x_row;
			const float p1 = v1*x_row;
			const float p2 = v2*x_row;
			const float p3 = v3*x_row;

			/* Scattered updates, in increasing entry order. */
			y_view[c0] += p0;
			y_view[c1] += p1;
			y_view[c2] += p2;
			y_view[c3] += p3;
			nz += 4;
		}

		/* Entries left over by the unrolled loop, one per pass. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float v0 = VA[nz];
			const float p0 = v0*x_row;

			y_view[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f${A^T} \cdot x\f$ to y, where A \neq A^T (no symmetry is exploited).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each stored row i from br (included) to bc (excluded) and each of its entries
	 * k = bpntr[i] .. bpntr[i+1]-1, VA[k] * x'[i] is added to y'[bindx[k]],
	 * with x' = rhs + (roff-coff) and y' = out + (coff-roff); both vectors have unit stride.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float *x_view = rhs+(roff-coff);	/* x' of the description above */
	float *y_view = out+(coff-roff);	/* y' of the description above */
	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const float x_row = x_view[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;

		/* Bulk of the row, four entries per pass. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float v0 = VA[nz+0];
			const float v1 = VA[nz+1];
			const float v2 = VA[nz+2];
			const float v3 = VA[nz+3];
			const float p0 = v0*x_row;
			const float p1 = v1*x_row;
			const float p2 = v2*x_row;
			const float p3 = v3*x_row;

			/* Scattered updates, in increasing entry order. */
			y_view[c0] += p0;
			y_view[c1] += p1;
			y_view[c2] += p2;
			y_view[c3] += p3;
			nz += 4;
		}

		/* Entries left over by the unrolled loop, one per pass. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float v0 = VA[nz];
			const float p0 = v0*x_row;

			y_view[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f${A^H} \cdot x\f$ to y, where A \neq A^T (no symmetry is exploited).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The values are of type float and are used as they are (no conjugation step appears in the code).
	 * For each stored row i from br (included) to bc (excluded) and each of its entries
	 * k = bpntr[i] .. bpntr[i+1]-1, VA[k] * x'[i] is added to y'[bindx[k]],
	 * with x' = rhs + (roff-coff) and y' = out + (coff-roff); both vectors have unit stride.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float *x_view = rhs+(roff-coff);	/* x' of the description above */
	float *y_view = out+(coff-roff);	/* y' of the description above */
	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const float x_row = x_view[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;

		/* Bulk of the row, four entries per pass. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float v0 = VA[nz+0];
			const float v1 = VA[nz+1];
			const float v2 = VA[nz+2];
			const float v3 = VA[nz+3];
			const float p0 = v0*x_row;
			const float p1 = v1*x_row;
			const float p2 = v2*x_row;
			const float p3 = v3*x_row;

			/* Scattered updates, in increasing entry order. */
			y_view[c0] += p0;
			y_view[c1] += p1;
			y_view[c2] += p2;
			y_view[c3] += p3;
			nz += 4;
		}

		/* Entries left over by the unrolled loop, one per pass. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float v0 = VA[nz];
			const float p0 = v0*x_row;

			y_view[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f${A^H} \cdot x\f$ to y, where A \neq A^T (no symmetry is exploited).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The values are of type float and are used as they are (no conjugation step appears in the code).
	 * For each stored row i from br (included) to bc (excluded) and each of its entries
	 * k = bpntr[i] .. bpntr[i+1]-1, VA[k] * x'[i] is added to y'[bindx[k]],
	 * with x' = rhs + (roff-coff) and y' = out + (coff-roff); both vectors have unit stride.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float *x_view = rhs+(roff-coff);	/* x' of the description above */
	float *y_view = out+(coff-roff);	/* y' of the description above */
	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const float x_row = x_view[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;

		/* Bulk of the row, four entries per pass. */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float v0 = VA[nz+0];
			const float v1 = VA[nz+1];
			const float v2 = VA[nz+2];
			const float v3 = VA[nz+3];
			const float p0 = v0*x_row;
			const float p1 = v1*x_row;
			const float p2 = v2*x_row;
			const float p3 = v3*x_row;

			/* Scattered updates, in increasing entry order. */
			y_view[c0] += p0;
			y_view[c1] += p1;
			y_view[c2] += p2;
			y_view[c3] += p3;
			nz += 4;
		}

		/* Entries left over by the unrolled loop, one per pass. */
		while(nz<row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float v0 = VA[nz];
			const float p0 = v0*x_row;

			y_view[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f${A} \cdot x\f$ to y, where A == A^T.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited; row i owns entries bpntr[i] .. bpntr[i+1]-1 of VA and bindx.
	 * Each entry k of row i, with j = bindx[k], is used twice:
	 *  - VA[k] * rhs[j] enters the row sum, which is added to out[i];
	 *  - mirrored, VA[k] * x'[i] is added to y'[j],
	 *    with x' = rhs + (roff-coff) and y' = out + (coff-roff).
	 * The first and the last entry of a row skip the mirrored update when roff == coff and j == i
	 * (presumably a diagonal entry, which must not be applied twice); interior entries never skip it.
	 * Both vectors are accessed with unit stride.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float *x_view = rhs+(roff-coff);	/* x' of the description above */
	float *y_view = out+(coff-roff);	/* y' of the description above */
	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const float x_row = x_view[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;
		register float row_sum = ((float)(0));
		rsb_coo_idx_t col;

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* First entry of the row: the only one, with the last, checked against the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz]*rhs[col];
		if(roff!=coff || col!=row)
			y_view[col] += VA[nz]*x_row;

		/* Interior entries (first and last left out), four per pass. */
		for(nz=row_begin+1; nz+3<row_end-1; nz+=4)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x0 = rhs[c0];
			const float x1 = rhs[c1];
			const float x2 = rhs[c2];
			const float x3 = rhs[c3];
			const float v0 = VA[nz+0];
			const float v1 = VA[nz+1];
			const float v2 = VA[nz+2];
			const float v3 = VA[nz+3];
			const float m0 = v0*x_row;
			const float m1 = v1*x_row;
			const float m2 = v2*x_row;
			const float m3 = v3*x_row;

			/* Updates are applied in increasing entry order. */
			row_sum += v0*x0;
			y_view[c0] += m0;
			row_sum += v1*x1;
			y_view[c1] += m1;
			row_sum += v2*x2;
			y_view[c2] += m2;
			row_sum += v3*x3;
			y_view[c3] += m3;
		}

		/* Interior entries left over by the unrolled loop, one per pass. */
		for(; nz<row_end-1; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];
			const float m0 = v0*x_row;

			row_sum += v0*x0;
			y_view[c0] += m0;
		}

		/* Last entry of the row (absent when the row holds a single entry). */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(roff!=coff || col!=row)
				y_view[col] += VA[nz]*x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Adds \f${A} \cdot x\f$ to y, where A == A^T.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited; row i owns entries bpntr[i] .. bpntr[i+1]-1 of VA and bindx.
	 * Each entry k of row i, with j = bindx[k], is used twice:
	 *  - VA[k] * rhs[j] enters the row sum, which is added to out[i];
	 *  - mirrored, VA[k] * x'[i] is added to y'[j],
	 *    with x' = rhs + (roff-coff) and y' = out + (coff-roff).
	 * The first and the last entry of a row skip the mirrored update when roff == coff and j == i
	 * (presumably a diagonal entry, which must not be applied twice); interior entries never skip it.
	 * Both vectors are accessed with unit stride.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced.
	 * This routine always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float *x_view = rhs+(roff-coff);	/* x' of the description above */
	float *y_view = out+(coff-roff);	/* y' of the description above */
	register rsb_coo_idx_t row;

	for(row=br; RSB_LIKELY(row<bc); ++row)	/* bounded box: rows br .. bc-1 only */
	{
		const float x_row = x_view[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		register rsb_nnz_idx_t nz = row_begin;
		register float row_sum = ((float)(0));
		rsb_coo_idx_t col;

		if(row_begin == row_end)
			continue;	/* nothing stored on this row */

		/* First entry of the row: the only one, with the last, checked against the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz]*rhs[col];
		if(roff!=coff || col!=row)
			y_view[col] += VA[nz]*x_row;

		/* Interior entries (first and last left out), four per pass. */
		for(nz=row_begin+1; nz+3<row_end-1; nz+=4)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x0 = rhs[c0];
			const float x1 = rhs[c1];
			const float x2 = rhs[c2];
			const float x3 = rhs[c3];
			const float v0 = VA[nz+0];
			const float v1 = VA[nz+1];
			const float v2 = VA[nz+2];
			const float v3 = VA[nz+3];
			const float m0 = v0*x_row;
			const float m1 = v1*x_row;
			const float m2 = v2*x_row;
			const float m3 = v3*x_row;

			/* Updates are applied in increasing entry order. */
			row_sum += v0*x0;
			y_view[c0] += m0;
			row_sum += v1*x1;
			y_view[c1] += m1;
			row_sum += v2*x2;
			y_view[c2] += m2;
			row_sum += v3*x3;
			y_view[c3] += m3;
		}

		/* Interior entries left over by the unrolled loop, one per pass. */
		for(; nz<row_end-1; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];
			const float m0 = v0*x_row;

			row_sum += v0*x0;
			y_view[c0] += m0;
		}

		/* Last entry of the row (absent when the row holds a single entry). */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(roff!=coff || col!=row)
				y_view[col] += VA[nz]*x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the transposed operand (\f$A^T\f$) of a symmetric matrix (\f$A == A^T\f$).
	 * No computation takes place here: every argument is handed over, unchanged,
	 * to rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sS_dE_uG.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric matrix: the transposed case is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the transposed operand (\f$A^T\f$) of a symmetric matrix (\f$A == A^T\f$).
	 * No computation takes place here: every argument is handed over, unchanged,
	 * to rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sS_dE_uG.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric matrix: the transposed case is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for the conjugate transposed operand (\f$A^H\f$) of a symmetric matrix (\f$A == A^T\f$).
	 * No computation takes place here: every argument is handed over, unchanged,
	 * to rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sS_dE_uG.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric matrix: the transposed case is served by the non transposed kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with explicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with explicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Operand views displaced by the row/column offset difference: they serve the mirrored term of each stored entry. */
	const float * const mirror_rhs = rhs+(roff-coff);
	float * const mirror_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = mirror_rhs[row];
		const rsb_nnz_idx_t first = bpntr[row], end = bpntr[row+1];
		float acc = ((float)(0));
		rsb_coo_idx_t col;
		rsb_nnz_idx_t nz, last;

		if(first==end)
			continue;	/* no stored entries: out[row] is not touched */

		/* First entry of the row: its mirrored update is skipped only when it lies on the diagonal and roff==coff. */
		col = bindx[first];
		acc += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			mirror_out[col] += VA[first]*xt;

		/* Entries strictly between the first and the last one: no diagonal test, four per iteration. */
		last = end-1;
		nz = first+1;
		while(nz+3<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			acc += val1*x1;
			mirror_out[col1] += t1;
			acc += val2*x2;
			mirror_out[col2] += t2;
			acc += val3*x3;
			mirror_out[col3] += t3;
			nz += 4;
		}
		/* Leftover in-between entries, one at a time. */
		while(nz<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			++nz;
		}

		/* Last entry, when distinct from the first: same diagonal test as for the first one. */
		if(nz<end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				mirror_out[col] += VA[nz]*xt;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with explicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Operand views displaced by the row/column offset difference: they serve the mirrored term of each stored entry. */
	const float * const mirror_rhs = rhs+(roff-coff);
	float * const mirror_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = mirror_rhs[row];
		const rsb_nnz_idx_t first = bpntr[row], end = bpntr[row+1];
		float acc = ((float)(0));
		rsb_coo_idx_t col;
		rsb_nnz_idx_t nz, last;

		if(first==end)
			continue;	/* no stored entries: out[row] is not touched */

		/* First entry of the row: its mirrored update is skipped only when it lies on the diagonal and roff==coff. */
		col = bindx[first];
		acc += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			mirror_out[col] += VA[first]*xt;

		/* Entries strictly between the first and the last one: no diagonal test, four per iteration. */
		last = end-1;
		nz = first+1;
		while(nz+3<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			acc += val1*x1;
			mirror_out[col1] += t1;
			acc += val2*x2;
			mirror_out[col2] += t2;
			acc += val3*x3;
			mirror_out[col3] += t3;
			nz += 4;
		}
		/* Leftover in-between entries, one at a time. */
		while(nz<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			++nz;
		}

		/* Last entry, when distinct from the first: same diagonal test as for the first one. */
		if(nz<end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				mirror_out[col] += VA[nz]*xt;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with explicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with explicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with explicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with explicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		float acc = ((float)(0));

		/* Four stored entries per iteration; products are accumulated in storage order. */
		while(nz+3<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];

			acc += val0*x0;
			acc += val1*x1;
			acc += val2*x2;
			acc += val3*x3;
			nz += 4;
		}
		/* Leftover entries, one at a time. */
		while(nz<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float val0 = VA[nz];

			acc += val0*x0;
			++nz;
		}

		out[row] += acc;	/* performed for rows without entries too (acc is then zero) */
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		float acc = ((float)(0));

		/* Four stored entries per iteration; products are accumulated in storage order. */
		while(nz+3<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];

			acc += val0*x0;
			acc += val1*x1;
			acc += val2*x2;
			acc += val3*x3;
			nz += 4;
		}
		/* Leftover entries, one at a time. */
		while(nz<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float val0 = VA[nz];

			acc += val0*x0;
			++nz;
		}

		out[row] += acc;	/* performed for rows without entries too (acc is then zero) */
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* Operand views displaced by the row/column offset difference: input is read by row, output is updated by column. */
	const float * const shifted_rhs = rhs+(roff-coff);
	float * const shifted_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = shifted_rhs[row];
		const rsb_nnz_idx_t end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Four stored entries per iteration: each one scatters value*xt to its column slot. */
		while(nz+3<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			shifted_out[col0] += t0;
			shifted_out[col1] += t1;
			shifted_out[col2] += t2;
			shifted_out[col3] += t3;
			nz += 4;
		}
		/* Leftover entries, one at a time. */
		while(nz<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			shifted_out[col0] += t0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* Operand views displaced by the row/column offset difference: input is read by row, output is updated by column. */
	const float * const shifted_rhs = rhs+(roff-coff);
	float * const shifted_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = shifted_rhs[row];
		const rsb_nnz_idx_t end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Four stored entries per iteration: each one scatters value*xt to its column slot. */
		while(nz+3<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			shifted_out[col0] += t0;
			shifted_out[col1] += t1;
			shifted_out[col2] += t2;
			shifted_out[col3] += t3;
			nz += 4;
		}
		/* Leftover entries, one at a time. */
		while(nz<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			shifted_out[col0] += t0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* Operand views displaced by the row/column offset difference: input is read by row, output is updated by column. */
	const float * const shifted_rhs = rhs+(roff-coff);
	float * const shifted_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = shifted_rhs[row];
		const rsb_nnz_idx_t end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Four stored entries per iteration: each one scatters value*xt to its column slot. */
		while(nz+3<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			shifted_out[col0] += t0;
			shifted_out[col1] += t1;
			shifted_out[col2] += t2;
			shifted_out[col3] += t3;
			nz += 4;
		}
		/* Leftover entries, one at a time. */
		while(nz<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			shifted_out[col0] += t0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* Operand views displaced by the row/column offset difference: input is read by row, output is updated by column. */
	const float * const shifted_rhs = rhs+(roff-coff);
	float * const shifted_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = shifted_rhs[row];
		const rsb_nnz_idx_t end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Four stored entries per iteration: each one scatters value*xt to its column slot. */
		while(nz+3<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			shifted_out[col0] += t0;
			shifted_out[col1] += t1;
			shifted_out[col2] += t2;
			shifted_out[col3] += t3;
			nz += 4;
		}
		/* Leftover entries, one at a time. */
		while(nz<end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			shifted_out[col0] += t0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* Operand views displaced by the row/column offset difference: they serve the mirrored term of each stored entry. */
	const float * const mirror_rhs = rhs+(roff-coff);
	float * const mirror_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = mirror_rhs[row];
		const rsb_nnz_idx_t first = bpntr[row], end = bpntr[row+1];
		float acc = ((float)(0));
		rsb_coo_idx_t col;
		rsb_nnz_idx_t nz, last;

		if(first==end)
			continue;	/* no stored entries: out[row] is not touched */

		/* First entry of the row: its mirrored update is skipped only when it lies on the diagonal and roff==coff. */
		col = bindx[first];
		acc += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			mirror_out[col] += VA[first]*xt;

		/* Entries strictly between the first and the last one: no diagonal test, four per iteration. */
		last = end-1;
		nz = first+1;
		while(nz+3<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			acc += val1*x1;
			mirror_out[col1] += t1;
			acc += val2*x2;
			mirror_out[col2] += t2;
			acc += val3*x3;
			mirror_out[col3] += t3;
			nz += 4;
		}
		/* Leftover in-between entries, one at a time. */
		while(nz<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			++nz;
		}

		/* Last entry, when distinct from the first: same diagonal test as for the first one. */
		if(nz<end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				mirror_out[col] += VA[nz]*xt;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* Operand views displaced by the row/column offset difference: they serve the mirrored term of each stored entry. */
	const float * const mirror_rhs = rhs+(roff-coff);
	float * const mirror_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = mirror_rhs[row];
		const rsb_nnz_idx_t first = bpntr[row], end = bpntr[row+1];
		float acc = ((float)(0));
		rsb_coo_idx_t col;
		rsb_nnz_idx_t nz, last;

		if(first==end)
			continue;	/* no stored entries: out[row] is not touched */

		/* First entry of the row: its mirrored update is skipped only when it lies on the diagonal and roff==coff. */
		col = bindx[first];
		acc += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			mirror_out[col] += VA[first]*xt;

		/* Entries strictly between the first and the last one: no diagonal test, four per iteration. */
		last = end-1;
		nz = first+1;
		while(nz+3<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			acc += val1*x1;
			mirror_out[col1] += t1;
			acc += val2*x2;
			mirror_out[col2] += t2;
			acc += val3*x3;
			mirror_out[col3] += t3;
			nz += 4;
		}
		/* Leftover in-between entries, one at a time. */
		while(nz<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			++nz;
		}

		/* Last entry, when distinct from the first: same diagonal test as for the first one. */
		if(nz<end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				mirror_out[col] += VA[nz]*xt;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* The transposed symmetric product reverts to the untransposed one: every argument is forwarded unchanged to the tN kernel. */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * A is expected as a 1 x 1 blocked BCSR matrix of type float, with implicit diagonal and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not really handled in this kernel: look at the caller level. */
	/* Operand views displaced by the row/column offset difference: they serve the mirrored term of each stored entry. */
	const float * const mirror_rhs = rhs+(roff-coff);
	float * const mirror_out = out+(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows in [br,bc): experimental, for the bounded box patch */
	{
		const float xt = mirror_rhs[row];
		const rsb_nnz_idx_t first = bpntr[row], end = bpntr[row+1];
		float acc = ((float)(0));
		rsb_coo_idx_t col;
		rsb_nnz_idx_t nz, last;

		if(first==end)
			continue;	/* no stored entries: out[row] is not touched */

		/* First entry of the row: its mirrored update is skipped only when it lies on the diagonal and roff==coff. */
		col = bindx[first];
		acc += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			mirror_out[col] += VA[first]*xt;

		/* Entries strictly between the first and the last one: no diagonal test, four per iteration. */
		last = end-1;
		nz = first+1;
		while(nz+3<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz], col1 = bindx[nz+1], col2 = bindx[nz+2], col3 = bindx[nz+3];
			const float val0 = VA[nz], val1 = VA[nz+1], val2 = VA[nz+2], val3 = VA[nz+3];
			const float x0 = rhs[col0], x1 = rhs[col1], x2 = rhs[col2], x3 = rhs[col3];
			const float t0 = val0*xt, t1 = val1*xt, t2 = val2*xt, t3 = val3*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			acc += val1*x1;
			mirror_out[col1] += t1;
			acc += val2*x2;
			mirror_out[col2] += t2;
			acc += val3*x3;
			mirror_out[col3] += t3;
			nz += 4;
		}
		/* Leftover in-between entries, one at a time. */
		while(nz<last)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float val0 = VA[nz];
			const float t0 = val0*xt;

			acc += val0*x0;
			mirror_out[col0] += t0;
			++nz;
		}

		/* Last entry, when distinct from the first: same diagonal test as for the first one. */
		if(nz<end)
		{
			col = bindx[nz];
			acc += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				mirror_out[col] += VA[nz]*xt;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Only rows in [br,bc) are visited. Each stored entry (row,col) is used
	 * twice: it is summed into a per-row accumulator that is finally added to
	 * out[row], and it is scattered, mirrored, into the shifted output at col.
	 * For the first and the last entry of a row the mirrored update is skipped
	 * when roff==coff and col==row; interior entries are never tested.
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const int offdiag_block = (roff != coff);	/* if set, the mirrored update is never skipped */
	const float *mirror_rhs = rhs+1*(roff-coff);
	float *mirror_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xr = mirror_rhs[row];	/* factor of every mirrored update of this row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col = 0;	/* half-width indices are widened on load */
		float row_acc = ((float)(0));

		if(nz == row_end)
			continue;	/* empty row: out[row] is left untouched */

		/* First entry of the row: mirrored update guarded by the diagonal test. */
		/* NOTE(review): presumably column indices are sorted within a row, so that
		 * a diagonal entry can only come first or last -- confirm. */
		col = bindx[nz];
		row_acc += VA[nz]*rhs[col];
		if(offdiag_block || col != row)
			mirror_out[col] += VA[nz]*xr;
		++nz;

		/* Interior entries, four at a time: all loads first, then the updates in entry order. */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const float x0 = rhs[c0];
			const float v0 = VA[nz+0];
			const float m0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float x1 = rhs[c1];
			const float v1 = VA[nz+1];
			const float m1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float x2 = rhs[c2];
			const float v2 = VA[nz+2];
			const float m2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x3 = rhs[c3];
			const float v3 = VA[nz+3];
			const float m3 = v3*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			row_acc += v1*x1;
			mirror_out[c1] += m1;
			row_acc += v2*x2;
			mirror_out[c2] += m2;
			row_acc += v3*x3;
			mirror_out[c3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled loop. */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];
			const float m0 = v0*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			++nz;
		}

		/* Last entry of the row (if distinct from the first): guarded like the first one. */
		if(nz < row_end)
		{
			col = bindx[nz];
			row_acc += VA[nz]*rhs[col];
			if(offdiag_block || col != row)
				mirror_out[col] += VA[nz]*xr;
		}
		out[row] += row_acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No computation happens here: every argument is handed over, unchanged,
	 * to the non transposed kernel of the same family, and that kernel's
	 * error value is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No computation happens here: every argument is handed over, unchanged,
	 * to the non transposed kernel of the same family, and that kernel's
	 * error value is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_C__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No computation happens here: every argument is handed over, unchanged,
	 * to the non transposed kernel of the same family, and that kernel's
	 * error value is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_H__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No computation happens here: every argument is handed over, unchanged,
	 * to the non transposed kernel of the same family, and that kernel's
	 * error value is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uaua_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each of the Mdim rows, out[row] is first cleared, then the entries
	 * bpntr[row] .. bpntr[row+1]-1 are multiplied by the rhs element selected
	 * by bindx and summed (four at a time, then one at a time) into a local
	 * accumulator, which is finally added to out[row].
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	rsb_coo_idx_t row = 0;

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		float row_acc = ((float)(0));

		out[row] = 0;

		/* Unrolled part: all loads first, then the sums in entry order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const float x0 = rhs[c0];
			const float v0 = VA[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float x1 = rhs[c1];
			const float v1 = VA[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float x2 = rhs[c2];
			const float v2 = VA[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x3 = rhs[c3];
			const float v3 = VA[nz+3];

			row_acc += v0*x0;
			row_acc += v1*x1;
			row_acc += v2*x2;
			row_acc += v3*x3;
			nz += 4;
		}

		/* Remainder (at most three entries). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];

			row_acc += v0*x0;
			++nz;
		}

		out[row] += row_acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each of the Mdim rows, out[row] is first cleared, then the entries
	 * bpntr[row] .. bpntr[row+1]-1 are multiplied by the rhs element selected
	 * by bindx and summed (four at a time, then one at a time) into a local
	 * accumulator, which is finally added to out[row].
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	rsb_coo_idx_t row = 0;

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		float row_acc = ((float)(0));

		out[row] = 0;

		/* Unrolled part: all loads first, then the sums in entry order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];	/* half-width index, widened on load */
			const float x0 = rhs[c0];
			const float v0 = VA[nz+0];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float x1 = rhs[c1];
			const float v1 = VA[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float x2 = rhs[c2];
			const float v2 = VA[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x3 = rhs[c3];
			const float v3 = VA[nz+3];

			row_acc += v0*x0;
			row_acc += v1*x1;
			row_acc += v2*x2;
			row_acc += v3*x3;
			nz += 4;
		}

		/* Remainder (at most three entries). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];

			row_acc += v0*x0;
			++nz;
		}

		out[row] += row_acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The matrix is still walked row by row: for each of the Mdim rows, the
	 * rhs element of that row (read through a pointer shifted by roff-coff)
	 * multiplies every entry of the row, and each product is added to the
	 * output element selected by bindx (through a pointer shifted by coff-roff).
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float *scatter_rhs = rhs+1*(roff-coff);
	float *scatter_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	/* NOTE(review): presumably a NULL factor makes rsb__cblas_Xscal clear the
	 * first mdim elements of out, as "y <- A^T x" requires -- confirm. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT, mdim*1, NULL, out, 1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xr = scatter_rhs[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Unrolled part: the four products are formed first, then scattered in entry order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const float v0 = VA[nz+0];
			const float p0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float v1 = VA[nz+1];
			const float p1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float v2 = VA[nz+2];
			const float p2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float v3 = VA[nz+3];
			const float p3 = v3*xr;

			scatter_out[c0] += p0;
			scatter_out[c1] += p1;
			scatter_out[c2] += p2;
			scatter_out[c3] += p3;
			nz += 4;
		}

		/* Remainder (at most three entries). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float v0 = VA[nz];
			const float p0 = v0*xr;

			scatter_out[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The matrix is still walked row by row: for each of the Mdim rows, the
	 * rhs element of that row (read through a pointer shifted by roff-coff)
	 * multiplies every entry of the row, and each product is added to the
	 * output element selected by bindx (through a pointer shifted by coff-roff).
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float *scatter_rhs = rhs+1*(roff-coff);
	float *scatter_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	/* NOTE(review): presumably a NULL factor makes rsb__cblas_Xscal clear the
	 * first mdim elements of out, as "y <- A^T x" requires -- confirm. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT, mdim*1, NULL, out, 1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xr = scatter_rhs[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Unrolled part: the four products are formed first, then scattered in entry order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];	/* half-width index, widened on load */
			const float v0 = VA[nz+0];
			const float p0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float v1 = VA[nz+1];
			const float p1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float v2 = VA[nz+2];
			const float p2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float v3 = VA[nz+3];
			const float p3 = v3*xr;

			scatter_out[c0] += p0;
			scatter_out[c1] += p1;
			scatter_out[c2] += p2;
			scatter_out[c3] += p3;
			nz += 4;
		}

		/* Remainder (at most three entries). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float v0 = VA[nz];
			const float p0 = v0*xr;

			scatter_out[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The values are plain (real) floats, so no conjugation appears in the code.
	 * The matrix is walked row by row: for each of the Mdim rows, the rhs
	 * element of that row (read through a pointer shifted by roff-coff)
	 * multiplies every entry of the row, and each product is added to the
	 * output element selected by bindx (through a pointer shifted by coff-roff).
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float *scatter_rhs = rhs+1*(roff-coff);
	float *scatter_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	/* NOTE(review): presumably a NULL factor makes rsb__cblas_Xscal clear the
	 * first mdim elements of out, as "y <- A^H x" requires -- confirm. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT, mdim*1, NULL, out, 1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xr = scatter_rhs[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Unrolled part: the four products are formed first, then scattered in entry order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const float v0 = VA[nz+0];
			const float p0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float v1 = VA[nz+1];
			const float p1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float v2 = VA[nz+2];
			const float p2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float v3 = VA[nz+3];
			const float p3 = v3*xr;

			scatter_out[c0] += p0;
			scatter_out[c1] += p1;
			scatter_out[c2] += p2;
			scatter_out[c3] += p3;
			nz += 4;
		}

		/* Remainder (at most three entries). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float v0 = VA[nz];
			const float p0 = v0*xr;

			scatter_out[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The values are plain (real) floats, so no conjugation appears in the code.
	 * The matrix is walked row by row: for each of the Mdim rows, the rhs
	 * element of that row (read through a pointer shifted by roff-coff)
	 * multiplies every entry of the row, and each product is added to the
	 * output element selected by bindx (through a pointer shifted by coff-roff).
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float *scatter_rhs = rhs+1*(roff-coff);
	float *scatter_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	/* NOTE(review): presumably a NULL factor makes rsb__cblas_Xscal clear the
	 * first mdim elements of out, as "y <- A^H x" requires -- confirm. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT, mdim*1, NULL, out, 1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xr = scatter_rhs[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Unrolled part: the four products are formed first, then scattered in entry order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];	/* half-width index, widened on load */
			const float v0 = VA[nz+0];
			const float p0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float v1 = VA[nz+1];
			const float p1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float v2 = VA[nz+2];
			const float p2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float v3 = VA[nz+3];
			const float p3 = v3*xr;

			scatter_out[c0] += p0;
			scatter_out[c1] += p1;
			scatter_out[c2] += p2;
			scatter_out[c3] += p3;
			nz += 4;
		}

		/* Remainder (at most three entries). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float v0 = VA[nz];
			const float p0 = v0*xr;

			scatter_out[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * All Mdim rows are visited. Each stored entry (row,col) is used twice:
	 * it is summed into a per-row accumulator that is finally added to
	 * out[row], and it is scattered, mirrored, into the shifted output at col.
	 * For the first and the last entry of a row the mirrored update is skipped
	 * when roff==coff and col==row; interior entries are never tested.
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	const int offdiag_block = (roff != coff);	/* if set, the mirrored update is never skipped */
	const float *mirror_rhs = rhs+1*(roff-coff);
	float *mirror_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	/* NOTE(review): presumably a NULL factor makes rsb__cblas_Xscal clear the
	 * first Mdim elements of out, as "y <- A x" requires -- confirm. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT, Mdim*1, NULL, out, 1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xr = mirror_rhs[row];	/* factor of every mirrored update of this row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col = 0;
		float row_acc = ((float)(0));

		if(nz == row_end)
			continue;	/* empty row: nothing is added to out[row] */

		/* First entry of the row: mirrored update guarded by the diagonal test. */
		/* NOTE(review): presumably column indices are sorted within a row, so that
		 * a diagonal entry can only come first or last -- confirm. */
		col = bindx[nz];
		row_acc += VA[nz]*rhs[col];
		if(offdiag_block || col != row)
			mirror_out[col] += VA[nz]*xr;
		++nz;

		/* Interior entries, four at a time: all loads first, then the updates in entry order. */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const float x0 = rhs[c0];
			const float v0 = VA[nz+0];
			const float m0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float x1 = rhs[c1];
			const float v1 = VA[nz+1];
			const float m1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float x2 = rhs[c2];
			const float v2 = VA[nz+2];
			const float m2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x3 = rhs[c3];
			const float v3 = VA[nz+3];
			const float m3 = v3*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			row_acc += v1*x1;
			mirror_out[c1] += m1;
			row_acc += v2*x2;
			mirror_out[c2] += m2;
			row_acc += v3*x3;
			mirror_out[c3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled loop. */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];
			const float m0 = v0*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			++nz;
		}

		/* Last entry of the row (if distinct from the first): guarded like the first one. */
		if(nz < row_end)
		{
			col = bindx[nz];
			row_acc += VA[nz]*rhs[col];
			if(offdiag_block || col != row)
				mirror_out[col] += VA[nz]*xr;
		}
		out[row] += row_acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * All Mdim rows are visited. Each stored entry (row,col) is used twice:
	 * it is summed into a per-row accumulator that is finally added to
	 * out[row], and it is scattered, mirrored, into the shifted output at col.
	 * For the first and the last entry of a row the mirrored update is skipped
	 * when roff==coff and col==row; interior entries are never tested.
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	const int offdiag_block = (roff != coff);	/* if set, the mirrored update is never skipped */
	const float *mirror_rhs = rhs+1*(roff-coff);
	float *mirror_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	/* NOTE(review): presumably a NULL factor makes rsb__cblas_Xscal clear the
	 * first Mdim elements of out, as "y <- A x" requires -- confirm. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT, Mdim*1, NULL, out, 1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xr = mirror_rhs[row];	/* factor of every mirrored update of this row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col = 0;	/* half-width indices are widened on load */
		float row_acc = ((float)(0));

		if(nz == row_end)
			continue;	/* empty row: nothing is added to out[row] */

		/* First entry of the row: mirrored update guarded by the diagonal test. */
		/* NOTE(review): presumably column indices are sorted within a row, so that
		 * a diagonal entry can only come first or last -- confirm. */
		col = bindx[nz];
		row_acc += VA[nz]*rhs[col];
		if(offdiag_block || col != row)
			mirror_out[col] += VA[nz]*xr;
		++nz;

		/* Interior entries, four at a time: all loads first, then the updates in entry order. */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const float x0 = rhs[c0];
			const float v0 = VA[nz+0];
			const float m0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float x1 = rhs[c1];
			const float v1 = VA[nz+1];
			const float m1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float x2 = rhs[c2];
			const float v2 = VA[nz+2];
			const float m2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x3 = rhs[c3];
			const float v3 = VA[nz+3];
			const float m3 = v3*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			row_acc += v1*x1;
			mirror_out[c1] += m1;
			row_acc += v2*x2;
			mirror_out[c2] += m2;
			row_acc += v3*x3;
			mirror_out[c3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled loop. */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];
			const float m0 = v0*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			++nz;
		}

		/* Last entry of the row (if distinct from the first): guarded like the first one. */
		if(nz < row_end)
		{
			col = bindx[nz];
			row_acc += VA[nz]*rhs[col];
			if(offdiag_block || col != row)
				mirror_out[col] += VA[nz]*xr;
		}
		out[row] += row_acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No computation happens here: every argument is handed over, unchanged,
	 * to the non transposed kernel of the same family, and that kernel's
	 * error value is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No computation happens here: every argument is handed over, unchanged,
	 * to the non transposed kernel of the same family, and that kernel's
	 * error value is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No computation happens here: every argument is handed over, unchanged,
	 * to the non transposed kernel of the same family, and that kernel's
	 * error value is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No computation happens here: every argument is handed over, unchanged,
	 * to the non transposed kernel of the same family, and that kernel's
	 * error value is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * All Mdim rows are visited. Each stored entry (row,col) is used twice:
	 * it is summed into a per-row accumulator that is finally added to
	 * out[row], and it is scattered, mirrored, into the shifted output at col.
	 * For the first and the last entry of a row the mirrored update is skipped
	 * when roff==coff and col==row; interior entries are never tested.
	 * The values are plain (real) floats, so no conjugation appears in the code.
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	const int offdiag_block = (roff != coff);	/* if set, the mirrored update is never skipped */
	const float *mirror_rhs = rhs+1*(roff-coff);
	float *mirror_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	/* NOTE(review): presumably a NULL factor makes rsb__cblas_Xscal clear the
	 * first Mdim elements of out, as "y <- A x" requires -- confirm. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT, Mdim*1, NULL, out, 1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xr = mirror_rhs[row];	/* factor of every mirrored update of this row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col = 0;
		float row_acc = ((float)(0));

		if(nz == row_end)
			continue;	/* empty row: nothing is added to out[row] */

		/* First entry of the row: mirrored update guarded by the diagonal test. */
		/* NOTE(review): presumably column indices are sorted within a row, so that
		 * a diagonal entry can only come first or last -- confirm. */
		col = bindx[nz];
		row_acc += VA[nz]*rhs[col];
		if(offdiag_block || col != row)
			mirror_out[col] += VA[nz]*xr;
		++nz;

		/* Interior entries, four at a time: all loads first, then the updates in entry order. */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const float x0 = rhs[c0];
			const float v0 = VA[nz+0];
			const float m0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float x1 = rhs[c1];
			const float v1 = VA[nz+1];
			const float m1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float x2 = rhs[c2];
			const float v2 = VA[nz+2];
			const float m2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x3 = rhs[c3];
			const float v3 = VA[nz+3];
			const float m3 = v3*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			row_acc += v1*x1;
			mirror_out[c1] += m1;
			row_acc += v2*x2;
			mirror_out[c2] += m2;
			row_acc += v3*x3;
			mirror_out[c3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled loop. */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];
			const float m0 = v0*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			++nz;
		}

		/* Last entry of the row (if distinct from the first): guarded like the first one. */
		if(nz < row_end)
		{
			col = bindx[nz];
			row_acc += VA[nz]*rhs[col];
			if(offdiag_block || col != row)
				mirror_out[col] += VA[nz]*xr;
		}
		out[row] += row_acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * All Mdim rows are visited. Each stored entry (row,col) is used twice:
	 * it is summed into a per-row accumulator that is finally added to
	 * out[row], and it is scattered, mirrored, into the shifted output at col.
	 * For the first and the last entry of a row the mirrored update is skipped
	 * when roff==coff and col==row; interior entries are never tested.
	 * The values are plain (real) floats, so no conjugation appears in the code.
	 * This kernel itself always returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	const int offdiag_block = (roff != coff);	/* if set, the mirrored update is never skipped */
	const float *mirror_rhs = rhs+1*(roff-coff);
	float *mirror_out = out+1*(coff-roff);
	rsb_coo_idx_t row = 0;

	/* NOTE(review): presumably a NULL factor makes rsb__cblas_Xscal clear the
	 * first Mdim elements of out, as "y <- A x" requires -- confirm. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT, Mdim*1, NULL, out, 1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xr = mirror_rhs[row];	/* factor of every mirrored update of this row */
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col = 0;	/* half-width indices are widened on load */
		float row_acc = ((float)(0));

		if(nz == row_end)
			continue;	/* empty row: nothing is added to out[row] */

		/* First entry of the row: mirrored update guarded by the diagonal test. */
		/* NOTE(review): presumably column indices are sorted within a row, so that
		 * a diagonal entry can only come first or last -- confirm. */
		col = bindx[nz];
		row_acc += VA[nz]*rhs[col];
		if(offdiag_block || col != row)
			mirror_out[col] += VA[nz]*xr;
		++nz;

		/* Interior entries, four at a time: all loads first, then the updates in entry order. */
		while(nz+3 < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz+0];
			const float x0 = rhs[c0];
			const float v0 = VA[nz+0];
			const float m0 = v0*xr;
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const float x1 = rhs[c1];
			const float v1 = VA[nz+1];
			const float m1 = v1*xr;
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const float x2 = rhs[c2];
			const float v2 = VA[nz+2];
			const float m2 = v2*xr;
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float x3 = rhs[c3];
			const float v3 = VA[nz+3];
			const float m3 = v3*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			row_acc += v1*x1;
			mirror_out[c1] += m1;
			row_acc += v2*x2;
			mirror_out[c2] += m2;
			row_acc += v3*x3;
			mirror_out[c3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled loop. */
		while(nz < row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float x0 = rhs[c0];
			const float v0 = VA[nz];
			const float m0 = v0*xr;

			row_acc += v0*x0;
			mirror_out[c0] += m0;
			++nz;
		}

		/* Last entry of the row (if distinct from the first): guarded like the first one. */
		if(nz < row_end)
		{
			col = bindx[nz];
			row_acc += VA[nz]*rhs[col];
			if(offdiag_block || col != row)
				mirror_out[col] += VA[nz]*xr;
		}
		out[row] += row_acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each row i in 0..Mdim-1, out[i] is overwritten with the sum over
	 * k in bpntr[i] .. bpntr[i+1]-1 of VA[k]*rhs[bindx[k]], added in increasing k.
	 * mdim, indptr, rpntr, cpntr, br, bc, roff, coff and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t row=0;

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;
		float rowsum=((float)(0));

		out[row]=0;

		/* four nonzeroes per pass */
		for(;nz+3<past;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];

			rowsum+=v0*x0;
			rowsum+=v1*x1;
			rowsum+=v2*x2;
			rowsum+=v3*x3;
		}
		/* up to three leftovers */
		for(;nz<past;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0];
			const float v0=VA[nz];

			rowsum+=v0*x0;
		}

		out[row]+=rowsum;
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each row i in 0..Mdim-1, out[i] is overwritten with the sum over
	 * k in bpntr[i] .. bpntr[i+1]-1 of VA[k]*rhs[bindx[k]], added in increasing k.
	 * mdim, indptr, rpntr, cpntr, br, bc, roff, coff and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t row=0;

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;
		float rowsum=((float)(0));

		out[row]=0;

		/* four nonzeroes per pass */
		for(;nz+3<past;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];

			rowsum+=v0*x0;
			rowsum+=v1*x1;
			rowsum+=v2*x2;
			rowsum+=v3*x3;
		}
		/* up to three leftovers */
		for(;nz<past;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0];
			const float v0=VA[nz];

			rowsum+=v0*x0;
		}

		out[row]+=rowsum;
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each stored row i in 0..Mdim-1, the rhs entry of that row (read at
	 * offset roff-coff) scales every nonzero k in bpntr[i] .. bpntr[i+1]-1,
	 * and the product is added to out at column bindx[k] shifted by coff-roff.
	 * indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *xt=rhs+1*(roff-coff);
	float *yt=out+1*(coff-roff);
	rsb_coo_idx_t row=0;

	/* NOTE(review): presumably clears out[0..mdim-1] (NULL scaling factor); confirm in rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT ,mdim*1,NULL,out,1);
	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xrow=xt[row];
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;

		/* four nonzeroes per pass: products first, then the scattered adds */
		for(;nz+3<past;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float m0=v0*xrow,m1=v1*xrow,m2=v2*xrow,m3=v3*xrow;

			yt[c0]+=m0;
			yt[c1]+=m1;
			yt[c2]+=m2;
			yt[c3]+=m3;
		}
		/* up to three leftovers */
		for(;nz<past;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float v0=VA[nz];
			const float m0=v0*xrow;

			yt[c0]+=m0;
		}
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each stored row i in 0..Mdim-1, the rhs entry of that row (read at
	 * offset roff-coff) scales every nonzero k in bpntr[i] .. bpntr[i+1]-1,
	 * and the product is added to out at column bindx[k] shifted by coff-roff.
	 * indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *xt=rhs+1*(roff-coff);
	float *yt=out+1*(coff-roff);
	rsb_coo_idx_t row=0;

	/* NOTE(review): presumably clears out[0..mdim-1] (NULL scaling factor); confirm in rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT ,mdim*1,NULL,out,1);
	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xrow=xt[row];
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;

		/* four nonzeroes per pass: products first, then the scattered adds */
		for(;nz+3<past;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float m0=v0*xrow,m1=v1*xrow,m2=v2*xrow,m3=v3*xrow;

			yt[c0]+=m0;
			yt[c1]+=m1;
			yt[c2]+=m2;
			yt[c3]+=m3;
		}
		/* up to three leftovers */
		for(;nz<past;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float v0=VA[nz];
			const float m0=v0*xrow;

			yt[c0]+=m0;
		}
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each stored row i in 0..Mdim-1, the rhs entry of that row (read at
	 * offset roff-coff) scales every nonzero k in bpntr[i] .. bpntr[i+1]-1,
	 * and the product is added to out at column bindx[k] shifted by coff-roff.
	 * The values are used as stored: no conjugation step appears in the code.
	 * indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *xt=rhs+1*(roff-coff);
	float *yt=out+1*(coff-roff);
	rsb_coo_idx_t row=0;

	/* NOTE(review): presumably clears out[0..mdim-1] (NULL scaling factor); confirm in rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT ,mdim*1,NULL,out,1);
	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xrow=xt[row];
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;

		/* four nonzeroes per pass: products first, then the scattered adds */
		for(;nz+3<past;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float m0=v0*xrow,m1=v1*xrow,m2=v2*xrow,m3=v3*xrow;

			yt[c0]+=m0;
			yt[c1]+=m1;
			yt[c2]+=m2;
			yt[c3]+=m3;
		}
		/* up to three leftovers */
		for(;nz<past;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float v0=VA[nz];
			const float m0=v0*xrow;

			yt[c0]+=m0;
		}
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each stored row i in 0..Mdim-1, the rhs entry of that row (read at
	 * offset roff-coff) scales every nonzero k in bpntr[i] .. bpntr[i+1]-1,
	 * and the product is added to out at column bindx[k] shifted by coff-roff.
	 * The values are used as stored: no conjugation step appears in the code.
	 * indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *xt=rhs+1*(roff-coff);
	float *yt=out+1*(coff-roff);
	rsb_coo_idx_t row=0;

	/* NOTE(review): presumably clears out[0..mdim-1] (NULL scaling factor); confirm in rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT ,mdim*1,NULL,out,1);
	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xrow=xt[row];
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;

		/* four nonzeroes per pass: products first, then the scattered adds */
		for(;nz+3<past;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float m0=v0*xrow,m1=v1*xrow,m2=v2*xrow,m3=v3*xrow;

			yt[c0]+=m0;
			yt[c1]+=m1;
			yt[c2]+=m2;
			yt[c3]+=m3;
		}
		/* up to three leftovers */
		for(;nz<past;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float v0=VA[nz];
			const float m0=v0*xrow;

			yt[c0]+=m0;
		}
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Row i spans nonzeroes bpntr[i] .. bpntr[i+1]-1 of VA and bindx. Each one is
	 * accumulated into out[i] (times rhs at its column) and also, scaled by the
	 * rhs entry of row i taken at offset roff-coff, into out at its column
	 * shifted by coff-roff. Only for the first and the last nonzero of a row
	 * is that second update skipped, when roff==coff and the column equals i.
	 * mdim, indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *xt=rhs+1*(roff-coff);
	float *yt=out+1*(coff-roff);
	rsb_coo_idx_t row=0;

	/* NOTE(review): presumably clears out[0..Mdim-1] (NULL scaling factor); confirm in rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT ,Mdim*1,NULL,out,1);
	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xrow=xt[row];
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;
		rsb_coo_idx_t col=0;
		float rowsum=((float)(0));

		/* empty row: nothing to add */
		if(nz==past)
			continue;

		/* first nonzero: second update only when not at (row,row) with roff==coff */
		col=bindx[nz];
		rowsum+=VA[nz]*rhs[col];
		if(!(roff==coff && col==row))
			yt[col]+=VA[nz]*xrow;
		++nz;

		/* interior nonzeroes, four per pass; both updates are unconditional */
		for(;nz+3<past-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float m0=v0*xrow,m1=v1*xrow,m2=v2*xrow,m3=v3*xrow;

			rowsum+=v0*x0;
			yt[c0]+=m0;
			rowsum+=v1*x1;
			yt[c1]+=m1;
			rowsum+=v2*x2;
			yt[c2]+=m2;
			rowsum+=v3*x3;
			yt[c3]+=m3;
		}
		/* interior leftovers, one at a time */
		for(;nz<past-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0];
			const float v0=VA[nz];
			const float m0=v0*xrow;

			rowsum+=v0*x0;
			yt[c0]+=m0;
		}

		/* last nonzero, when the row has more than one: same guard as the first */
		if(nz<past)
		{
			col=bindx[nz];
			rowsum+=VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yt[col]+=VA[nz]*xrow;
		}
		out[row]+=rowsum;
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Row i spans nonzeroes bpntr[i] .. bpntr[i+1]-1 of VA and bindx. Each one is
	 * accumulated into out[i] (times rhs at its column) and also, scaled by the
	 * rhs entry of row i taken at offset roff-coff, into out at its column
	 * shifted by coff-roff. Only for the first and the last nonzero of a row
	 * is that second update skipped, when roff==coff and the column equals i.
	 * mdim, indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *xt=rhs+1*(roff-coff);
	float *yt=out+1*(coff-roff);
	rsb_coo_idx_t row=0;

	/* NOTE(review): presumably clears out[0..Mdim-1] (NULL scaling factor); confirm in rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT ,Mdim*1,NULL,out,1);
	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xrow=xt[row];
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;
		rsb_coo_idx_t col=0;
		float rowsum=((float)(0));

		/* empty row: nothing to add */
		if(nz==past)
			continue;

		/* first nonzero: second update only when not at (row,row) with roff==coff */
		col=bindx[nz];
		rowsum+=VA[nz]*rhs[col];
		if(!(roff==coff && col==row))
			yt[col]+=VA[nz]*xrow;
		++nz;

		/* interior nonzeroes, four per pass; both updates are unconditional */
		for(;nz+3<past-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float m0=v0*xrow,m1=v1*xrow,m2=v2*xrow,m3=v3*xrow;

			rowsum+=v0*x0;
			yt[c0]+=m0;
			rowsum+=v1*x1;
			yt[c1]+=m1;
			rowsum+=v2*x2;
			yt[c2]+=m2;
			rowsum+=v3*x3;
			yt[c3]+=m3;
		}
		/* interior leftovers, one at a time */
		for(;nz<past-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0];
			const float v0=VA[nz];
			const float m0=v0*xrow;

			rowsum+=v0*x0;
			yt[c0]+=m0;
		}

		/* last nonzero, when the row has more than one: same guard as the first */
		if(nz<past)
		{
			col=bindx[nz];
			rowsum+=VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yt[col]+=VA[nz]*xrow;
		}
		out[row]+=rowsum;
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Row i spans nonzeroes bpntr[i] .. bpntr[i+1]-1 of VA and bindx. Each one is
	 * accumulated into out[i] (times rhs at its column) and also, scaled by the
	 * rhs entry of row i taken at offset roff-coff, into out at its column
	 * shifted by coff-roff. Only for the first and the last nonzero of a row
	 * is that second update skipped, when roff==coff and the column equals i.
	 * mdim, indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *xt=rhs+1*(roff-coff);
	float *yt=out+1*(coff-roff);
	rsb_coo_idx_t row=0;

	/* NOTE(review): presumably clears out[0..Mdim-1] (NULL scaling factor); confirm in rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT ,Mdim*1,NULL,out,1);
	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xrow=xt[row];
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;
		rsb_coo_idx_t col=0;
		float rowsum=((float)(0));

		/* empty row: nothing to add */
		if(nz==past)
			continue;

		/* first nonzero: second update only when not at (row,row) with roff==coff */
		col=bindx[nz];
		rowsum+=VA[nz]*rhs[col];
		if(!(roff==coff && col==row))
			yt[col]+=VA[nz]*xrow;
		++nz;

		/* interior nonzeroes, four per pass; both updates are unconditional */
		for(;nz+3<past-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float m0=v0*xrow,m1=v1*xrow,m2=v2*xrow,m3=v3*xrow;

			rowsum+=v0*x0;
			yt[c0]+=m0;
			rowsum+=v1*x1;
			yt[c1]+=m1;
			rowsum+=v2*x2;
			yt[c2]+=m2;
			rowsum+=v3*x3;
			yt[c3]+=m3;
		}
		/* interior leftovers, one at a time */
		for(;nz<past-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0];
			const float v0=VA[nz];
			const float m0=v0*xrow;

			rowsum+=v0*x0;
			yt[c0]+=m0;
		}

		/* last nonzero, when the row has more than one: same guard as the first */
		if(nz<past)
		{
			col=bindx[nz];
			rowsum+=VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yt[col]+=VA[nz]*xrow;
		}
		out[row]+=rowsum;
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Row i spans nonzeroes bpntr[i] .. bpntr[i+1]-1 of VA and bindx. Each one is
	 * accumulated into out[i] (times rhs at its column) and also, scaled by the
	 * rhs entry of row i taken at offset roff-coff, into out at its column
	 * shifted by coff-roff. Only for the first and the last nonzero of a row
	 * is that second update skipped, when roff==coff and the column equals i.
	 * mdim, indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *xt=rhs+1*(roff-coff);
	float *yt=out+1*(coff-roff);
	rsb_coo_idx_t row=0;

	/* NOTE(review): presumably clears out[0..Mdim-1] (NULL scaling factor); confirm in rsb__cblas_Xscal */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT ,Mdim*1,NULL,out,1);
	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float xrow=xt[row];
		const rsb_nnz_idx_t first=bpntr[row],past=bpntr[row+1];
		rsb_nnz_idx_t nz=first;
		rsb_coo_idx_t col=0;
		float rowsum=((float)(0));

		/* empty row: nothing to add */
		if(nz==past)
			continue;

		/* first nonzero: second update only when not at (row,row) with roff==coff */
		col=bindx[nz];
		rowsum+=VA[nz]*rhs[col];
		if(!(roff==coff && col==row))
			yt[col]+=VA[nz]*xrow;
		++nz;

		/* interior nonzeroes, four per pass; both updates are unconditional */
		for(;nz+3<past-1;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0],x1=rhs[c1],x2=rhs[c2],x3=rhs[c3];
			const float v0=VA[nz],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float m0=v0*xrow,m1=v1*xrow,m2=v2*xrow,m3=v3*xrow;

			rowsum+=v0*x0;
			yt[c0]+=m0;
			rowsum+=v1*x1;
			yt[c1]+=m1;
			rowsum+=v2*x2;
			yt[c2]+=m2;
			rowsum+=v3*x3;
			yt[c3]+=m3;
		}
		/* interior leftovers, one at a time */
		for(;nz<past-1;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0];
			const float v0=VA[nz];
			const float m0=v0*xrow;

			rowsum+=v0*x0;
			yt[c0]+=m0;
		}

		/* last nonzero, when the row has more than one: same guard as the first */
		if(nz<past)
		{
			col=bindx[nz];
			rowsum+=VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yt[col]+=VA[nz]*xrow;
		}
		out[row]+=rowsum;
	}

	/* no failure path in this kernel */
	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every argument is handed unchanged to the non-transposed (tN) kernel of
	 * the same type, symmetry and diagonal kind; its result is returned as is.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_C__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow {A^H} \cdot x\f$ for a matrix with \f$A = A^H\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with implicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller level. */
	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_C__tN_r1_c1_uu_sH_dI_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_H__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow {A^H} \cdot x\f$ for a matrix with \f$A = A^H\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with implicit diagonal
	 * and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller level. */
	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uauz_float_H__tN_r1_c1_uu_sH_dI_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with \f$A \neq A^T\f$ (no symmetry is exploited).
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		float dot = 0.0f;

		/* Bulk of the row: four nonzeroes per trip, added up in storage order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];

			dot += VA[nz  ]*rhs[col0];
			dot += VA[nz+1]*rhs[col1];
			dot += VA[nz+2]*rhs[col2];
			dot += VA[nz+3]*rhs[col3];
			nz += 4;
		}

		/* Remainder of the row (fewer than four nonzeroes). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];

			dot += VA[nz]*rhs[col];
			++nz;
		}

		/* The scalar is applied once per row, on the finished sum. */
		out[row] += scale*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with \f$A \neq A^T\f$ (no symmetry is exploited).
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		float dot = 0.0f;

		/* Bulk of the row: four nonzeroes per trip, added up in storage order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];

			dot += VA[nz  ]*rhs[col0];
			dot += VA[nz+1]*rhs[col1];
			dot += VA[nz+2]*rhs[col2];
			dot += VA[nz+3]*rhs[col3];
			nz += 4;
		}

		/* Remainder of the row (fewer than four nonzeroes). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];

			dot += VA[nz]*rhs[col];
			++nz;
		}

		/* The scalar is applied once per row, on the finished sum. */
		out[row] += scale*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with \f$A \neq A^T\f$ (no symmetry is exploited).
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	const float *xrow = rhs + (roff - coff);	/* rhs rebased: indexed below by the matrix row */
	float *ycol = out + (coff - roff);		/* out rebased: indexed below by the column index */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		/* One scaled operand entry serves the whole stored row. */
		const float ax = scale*xrow[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Bulk of the row: four products are formed first, then scattered in storage order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float add0 = VA[nz  ]*ax;
			const float add1 = VA[nz+1]*ax;
			const float add2 = VA[nz+2]*ax;
			const float add3 = VA[nz+3]*ax;

			ycol[col0] += add0;
			ycol[col1] += add1;
			ycol[col2] += add2;
			ycol[col3] += add3;
			nz += 4;
		}

		/* Remainder of the row (fewer than four nonzeroes). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float add = VA[nz]*ax;

			ycol[col] += add;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with \f$A \neq A^T\f$ (no symmetry is exploited).
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	const float *xrow = rhs + (roff - coff);	/* rhs rebased: indexed below by the matrix row */
	float *ycol = out + (coff - roff);		/* out rebased: indexed below by the column index */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		/* One scaled operand entry serves the whole stored row. */
		const float ax = scale*xrow[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Bulk of the row: four products are formed first, then scattered in storage order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float add0 = VA[nz  ]*ax;
			const float add1 = VA[nz+1]*ax;
			const float add2 = VA[nz+2]*ax;
			const float add3 = VA[nz+3]*ax;

			ycol[col0] += add0;
			ycol[col1] += add1;
			ycol[col2] += add2;
			ycol[col3] += add3;
			nz += 4;
		}

		/* Remainder of the row (fewer than four nonzeroes). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float add = VA[nz]*ax;

			ycol[col] += add;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with \f$A \neq A^T\f$ (no symmetry is exploited).
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * The values are used as stored: no conjugation step appears in this float kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	const float *xrow = rhs + (roff - coff);	/* rhs rebased: indexed below by the matrix row */
	float *ycol = out + (coff - roff);		/* out rebased: indexed below by the column index */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		/* One scaled operand entry serves the whole stored row. */
		const float ax = scale*xrow[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Bulk of the row: four products are formed first, then scattered in storage order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float add0 = VA[nz  ]*ax;
			const float add1 = VA[nz+1]*ax;
			const float add2 = VA[nz+2]*ax;
			const float add3 = VA[nz+3]*ax;

			ycol[col0] += add0;
			ycol[col1] += add1;
			ycol[col2] += add2;
			ycol[col3] += add3;
			nz += 4;
		}

		/* Remainder of the row (fewer than four nonzeroes). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float add = VA[nz]*ax;

			ycol[col] += add;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with \f$A \neq A^T\f$ (no symmetry is exploited).
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * The values are used as stored: no conjugation step appears in this float kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	const float *xrow = rhs + (roff - coff);	/* rhs rebased: indexed below by the matrix row */
	float *ycol = out + (coff - roff);		/* out rebased: indexed below by the column index */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		/* One scaled operand entry serves the whole stored row. */
		const float ax = scale*xrow[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* Bulk of the row: four products are formed first, then scattered in storage order. */
		while(nz+3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz  ];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float add0 = VA[nz  ]*ax;
			const float add1 = VA[nz+1]*ax;
			const float add2 = VA[nz+2]*ax;
			const float add3 = VA[nz+3]*ax;

			ycol[col0] += add0;
			ycol[col1] += add1;
			ycol[col2] += add2;
			ycol[col3] += add3;
			nz += 4;
		}

		/* Remainder of the row (fewer than four nonzeroes). */
		while(nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float add = VA[nz]*ax;

			ycol[col] += add;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$ for a matrix with \f$A = A^T\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * Each stored nonzero is applied twice: to the row sum, and as mirrored update.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	const float *xrow = rhs + (roff - coff);	/* rhs rebased: indexed below by the matrix row */
	float *ycol = out + (coff - roff);		/* out rebased: indexed below by the column index */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		const float ax = scale*xrow[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		float dot = 0.0f;

		/* A row without nonzeroes leaves out completely untouched. */
		if(row_begin != row_end)
		{
			/* First nonzero: its mirrored update is skipped when roff==coff and
			 * column equals row (presumably a diagonal entry, not to be applied twice). */
			col = bindx[nz];
			dot += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				ycol[col] += VA[nz]*ax;
			++nz;

			/* Interior nonzeroes (the last one excluded), four per trip, with no such check. */
			while(nz+3 < row_end-1)
			{
				const rsb_coo_idx_t col0 = bindx[nz  ];
				const rsb_coo_idx_t col1 = bindx[nz+1];
				const rsb_coo_idx_t col2 = bindx[nz+2];
				const rsb_coo_idx_t col3 = bindx[nz+3];
				const float val0 = VA[nz  ], x0 = rhs[col0];
				const float val1 = VA[nz+1], x1 = rhs[col1];
				const float val2 = VA[nz+2], x2 = rhs[col2];
				const float val3 = VA[nz+3], x3 = rhs[col3];
				const float add0 = val0*ax;
				const float add1 = val1*ax;
				const float add2 = val2*ax;
				const float add3 = val3*ax;

				dot += val0*x0;
				ycol[col0] += add0;
				dot += val1*x1;
				ycol[col1] += add1;
				dot += val2*x2;
				ycol[col2] += add2;
				dot += val3*x3;
				ycol[col3] += add3;
				nz += 4;
			}

			/* Interior nonzeroes left over by the four-wise loop. */
			while(nz < row_end-1)
			{
				const rsb_coo_idx_t colr = bindx[nz];
				const float valr = VA[nz], xr = rhs[colr];
				const float addr = valr*ax;

				dot += valr*xr;
				ycol[colr] += addr;
				++nz;
			}

			/* Last nonzero (if the row has more than one): same check as the first. */
			if(nz < row_end)
			{
				col = bindx[nz];
				dot += VA[nz]*rhs[col];
				if(!(roff == coff && col == row))
					ycol[col] += VA[nz]*ax;
				++nz;
			}

			/* The scalar is applied once per row, on the finished sum. */
			out[row] += scale*dot;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$ for a matrix with \f$A = A^T\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * Each stored nonzero is applied twice: to the row sum, and as mirrored update.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	const float *xrow = rhs + (roff - coff);	/* rhs rebased: indexed below by the matrix row */
	float *ycol = out + (coff - roff);		/* out rebased: indexed below by the column index */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		const float ax = scale*xrow[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		float dot = 0.0f;

		/* A row without nonzeroes leaves out completely untouched. */
		if(row_begin != row_end)
		{
			/* First nonzero: its mirrored update is skipped when roff==coff and
			 * column equals row (presumably a diagonal entry, not to be applied twice). */
			col = bindx[nz];
			dot += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				ycol[col] += VA[nz]*ax;
			++nz;

			/* Interior nonzeroes (the last one excluded), four per trip, with no such check. */
			while(nz+3 < row_end-1)
			{
				const rsb_coo_idx_t col0 = bindx[nz  ];
				const rsb_coo_idx_t col1 = bindx[nz+1];
				const rsb_coo_idx_t col2 = bindx[nz+2];
				const rsb_coo_idx_t col3 = bindx[nz+3];
				const float val0 = VA[nz  ], x0 = rhs[col0];
				const float val1 = VA[nz+1], x1 = rhs[col1];
				const float val2 = VA[nz+2], x2 = rhs[col2];
				const float val3 = VA[nz+3], x3 = rhs[col3];
				const float add0 = val0*ax;
				const float add1 = val1*ax;
				const float add2 = val2*ax;
				const float add3 = val3*ax;

				dot += val0*x0;
				ycol[col0] += add0;
				dot += val1*x1;
				ycol[col1] += add1;
				dot += val2*x2;
				ycol[col2] += add2;
				dot += val3*x3;
				ycol[col3] += add3;
				nz += 4;
			}

			/* Interior nonzeroes left over by the four-wise loop. */
			while(nz < row_end-1)
			{
				const rsb_coo_idx_t colr = bindx[nz];
				const float valr = VA[nz], xr = rhs[colr];
				const float addr = valr*ax;

				dot += valr*xr;
				ycol[colr] += addr;
				++nz;
			}

			/* Last nonzero (if the row has more than one): same check as the first. */
			if(nz < row_end)
			{
				col = bindx[nz];
				dot += VA[nz]*rhs[col];
				if(!(roff == coff && col == row))
					ycol[col] += VA[nz]*ax;
				++nz;
			}

			/* The scalar is applied once per row, on the finished sum. */
			out[row] += scale*dot;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$ for a matrix with \f$A = A^T\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sS_dE_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags, alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$ for a matrix with \f$A = A^T\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sS_dE_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags, alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$ for a matrix with \f$A = A^T\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sS_dE_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags, alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$ for a matrix with \f$A = A^T\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sS_dE_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags, alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$ for a matrix with \f$A = A^H\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * Each stored nonzero is applied twice: to the row sum, and as mirrored update
	 * (the values are used as stored: no conjugation step appears in this float kernel).
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	const float *xrow = rhs + (roff - coff);	/* rhs rebased: indexed below by the matrix row */
	float *ycol = out + (coff - roff);		/* out rebased: indexed below by the column index */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		const float ax = scale*xrow[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		float dot = 0.0f;

		/* A row without nonzeroes leaves out completely untouched. */
		if(row_begin != row_end)
		{
			/* First nonzero: its mirrored update is skipped when roff==coff and
			 * column equals row (presumably a diagonal entry, not to be applied twice). */
			col = bindx[nz];
			dot += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				ycol[col] += VA[nz]*ax;
			++nz;

			/* Interior nonzeroes (the last one excluded), four per trip, with no such check. */
			while(nz+3 < row_end-1)
			{
				const rsb_coo_idx_t col0 = bindx[nz  ];
				const rsb_coo_idx_t col1 = bindx[nz+1];
				const rsb_coo_idx_t col2 = bindx[nz+2];
				const rsb_coo_idx_t col3 = bindx[nz+3];
				const float val0 = VA[nz  ], x0 = rhs[col0];
				const float val1 = VA[nz+1], x1 = rhs[col1];
				const float val2 = VA[nz+2], x2 = rhs[col2];
				const float val3 = VA[nz+3], x3 = rhs[col3];
				const float add0 = val0*ax;
				const float add1 = val1*ax;
				const float add2 = val2*ax;
				const float add3 = val3*ax;

				dot += val0*x0;
				ycol[col0] += add0;
				dot += val1*x1;
				ycol[col1] += add1;
				dot += val2*x2;
				ycol[col2] += add2;
				dot += val3*x3;
				ycol[col3] += add3;
				nz += 4;
			}

			/* Interior nonzeroes left over by the four-wise loop. */
			while(nz < row_end-1)
			{
				const rsb_coo_idx_t colr = bindx[nz];
				const float valr = VA[nz], xr = rhs[colr];
				const float addr = valr*ax;

				dot += valr*xr;
				ycol[colr] += addr;
				++nz;
			}

			/* Last nonzero (if the row has more than one): same check as the first. */
			if(nz < row_end)
			{
				col = bindx[nz];
				dot += VA[nz]*rhs[col];
				if(!(roff == coff && col == row))
					ycol[col] += VA[nz]*ax;
				++nz;
			}

			/* The scalar is applied once per row, on the finished sum. */
			out[row] += scale*dot;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$ for a matrix with \f$A = A^H\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * Rows from br (included) to bc (excluded) are visited; \f$\alpha\f$ is read from alphap.
	 * Each stored nonzero is applied twice: to the row sum, and as mirrored update
	 * (the values are used as stored: no conjugation step appears in this float kernel).
	 * \return \rsb_errval_inp_param_msg
	 */

	const float scale = *alphap;
	const float *xrow = rhs + (roff - coff);	/* rhs rebased: indexed below by the matrix row */
	float *ycol = out + (coff - roff);		/* out rebased: indexed below by the column index */
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* only the rows in [br,bc) are visited */
	{
		const float ax = scale*xrow[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;
		float dot = 0.0f;

		/* A row without nonzeroes leaves out completely untouched. */
		if(row_begin != row_end)
		{
			/* First nonzero: its mirrored update is skipped when roff==coff and
			 * column equals row (presumably a diagonal entry, not to be applied twice). */
			col = bindx[nz];
			dot += VA[nz]*rhs[col];
			if(!(roff == coff && col == row))
				ycol[col] += VA[nz]*ax;
			++nz;

			/* Interior nonzeroes (the last one excluded), four per trip, with no such check. */
			while(nz+3 < row_end-1)
			{
				const rsb_coo_idx_t col0 = bindx[nz  ];
				const rsb_coo_idx_t col1 = bindx[nz+1];
				const rsb_coo_idx_t col2 = bindx[nz+2];
				const rsb_coo_idx_t col3 = bindx[nz+3];
				const float val0 = VA[nz  ], x0 = rhs[col0];
				const float val1 = VA[nz+1], x1 = rhs[col1];
				const float val2 = VA[nz+2], x2 = rhs[col2];
				const float val3 = VA[nz+3], x3 = rhs[col3];
				const float add0 = val0*ax;
				const float add1 = val1*ax;
				const float add2 = val2*ax;
				const float add3 = val3*ax;

				dot += val0*x0;
				ycol[col0] += add0;
				dot += val1*x1;
				ycol[col1] += add1;
				dot += val2*x2;
				ycol[col2] += add2;
				dot += val3*x3;
				ycol[col3] += add3;
				nz += 4;
			}

			/* Interior nonzeroes left over by the four-wise loop. */
			while(nz < row_end-1)
			{
				const rsb_coo_idx_t colr = bindx[nz];
				const float valr = VA[nz], xr = rhs[colr];
				const float addr = valr*ax;

				dot += valr*xr;
				ycol[colr] += addr;
				++nz;
			}

			/* Last nonzero (if the row has more than one): same check as the first. */
			if(nz < row_end)
			{
				col = bindx[nz];
				dot += VA[nz]*rhs[col];
				if(!(roff == coff && col == row))
					ycol[col] += VA[nz]*ax;
				++nz;
			}

			/* The scalar is applied once per row, on the finished sum. */
			out[row] += scale*dot;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$ for a matrix with \f$A = A^H\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sH_dE_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags, alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$ for a matrix with \f$A = A^H\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sH_dE_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags, alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$ for a matrix with \f$A = A^H\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_coo_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sH_dE_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags, alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Performs \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$ for a matrix with \f$A = A^H\f$.
	 * A is expected as 1 x 1 blocked BCSR, float valued, with explicit diagonal
	 * and rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* With this symmetry and a real numerical type the requested transposition
	 * makes no difference: every argument is handed over, unchanged, to the
	 * untransposed kernel, whose error value is returned. */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sH_dE_uG(
		VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr,
		br, bc, roff, coff, flags, alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc) the products VA[k]*rhs[bindx[k]], with k
	 * running over [bpntr[i],bpntr[i+1]), are summed in storage order into a
	 * single accumulator; the sum times *alphap is then added to out[i].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not
	 * referenced by this body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	rsb_coo_idx_t row = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		float dot = ((float)(0));

		/* Bulk of the row, four nonzeroes per trip. */
		for(nz=row_begin;nz+3<row_end;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float x0 = rhs[col0], v0 = VA[nz+0];
			const float x1 = rhs[col1], v1 = VA[nz+1];
			const float x2 = rhs[col2], v2 = VA[nz+2];
			const float x3 = rhs[col3], v3 = VA[nz+3];

			dot += v0*x0;
			dot += v1*x1;
			dot += v2*x2;
			dot += v3*x3;
		}

		/* Up to three leftover nonzeroes. */
		for(;nz<row_end;++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float x = rhs[col];
			const float v = VA[nz];

			dot += v*x;
		}

		out[row] += alpha*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc) the products VA[k]*rhs[bindx[k]], with k
	 * running over [bpntr[i],bpntr[i+1]), are summed in storage order into a
	 * single accumulator; the sum times *alphap is then added to out[i].
	 * Column indices are widened to rsb_coo_idx_t before being used.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not
	 * referenced by this body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	rsb_coo_idx_t row = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		float dot = ((float)(0));

		/* Bulk of the row, four nonzeroes per trip. */
		for(nz=row_begin;nz+3<row_end;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float x0 = rhs[col0], v0 = VA[nz+0];
			const float x1 = rhs[col1], v1 = VA[nz+1];
			const float x2 = rhs[col2], v2 = VA[nz+2];
			const float x3 = rhs[col3], v3 = VA[nz+3];

			dot += v0*x0;
			dot += v1*x1;
			dot += v2*x2;
			dot += v3*x3;
		}

		/* Up to three leftover nonzeroes. */
		for(;nz<row_end;++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float x = rhs[col];
			const float v = VA[nz];

			dot += v*x;
		}

		out[row] += alpha*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Scatter form: rhs is re-based by (roff-coff) and out by (coff-roff);
	 * for each row i in [br,bc) the re-based rhs entry i is scaled by *alphap
	 * once, and VA[k] times that value is added to the re-based out entry
	 * bindx[k] for every k in [bpntr[i],bpntr[i+1]).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this
	 * body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	const float *xshift = rhs+(roff-coff);
	float *yshift = out+(coff-roff);
	rsb_coo_idx_t row = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xs = alpha*xshift[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* Bulk of the row, four nonzeroes per trip: products first, updates after. */
		for(nz=row_begin;nz+3<row_end;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float upd0 = VA[nz+0]*xs;
			const float upd1 = VA[nz+1]*xs;
			const float upd2 = VA[nz+2]*xs;
			const float upd3 = VA[nz+3]*xs;

			yshift[col0] += upd0;
			yshift[col1] += upd1;
			yshift[col2] += upd2;
			yshift[col3] += upd3;
		}

		/* Up to three leftover nonzeroes. */
		for(;nz<row_end;++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz]*xs;

			yshift[col] += upd;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Scatter form: rhs is re-based by (roff-coff) and out by (coff-roff);
	 * for each row i in [br,bc) the re-based rhs entry i is scaled by *alphap
	 * once, and VA[k] times that value is added to the re-based out entry
	 * bindx[k] for every k in [bpntr[i],bpntr[i+1]).
	 * Column indices are widened to rsb_coo_idx_t before being used.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this
	 * body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	const float *xshift = rhs+(roff-coff);
	float *yshift = out+(coff-roff);
	rsb_coo_idx_t row = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xs = alpha*xshift[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* Bulk of the row, four nonzeroes per trip: products first, updates after. */
		for(nz=row_begin;nz+3<row_end;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float upd0 = VA[nz+0]*xs;
			const float upd1 = VA[nz+1]*xs;
			const float upd2 = VA[nz+2]*xs;
			const float upd3 = VA[nz+3]*xs;

			yshift[col0] += upd0;
			yshift[col1] += upd1;
			yshift[col2] += upd2;
			yshift[col3] += upd3;
		}

		/* Up to three leftover nonzeroes. */
		for(;nz<row_end;++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz]*xs;

			yshift[col] += upd;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Scatter form: rhs is re-based by (roff-coff) and out by (coff-roff);
	 * for each row i in [br,bc) the re-based rhs entry i is scaled by *alphap
	 * once, and VA[k] times that value is added to the re-based out entry
	 * bindx[k] for every k in [bpntr[i],bpntr[i+1]).
	 * No conjugation step appears in this body (the values are plain float).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this
	 * body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	const float *xshift = rhs+(roff-coff);
	float *yshift = out+(coff-roff);
	rsb_coo_idx_t row = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xs = alpha*xshift[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* Bulk of the row, four nonzeroes per trip: products first, updates after. */
		for(nz=row_begin;nz+3<row_end;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float upd0 = VA[nz+0]*xs;
			const float upd1 = VA[nz+1]*xs;
			const float upd2 = VA[nz+2]*xs;
			const float upd3 = VA[nz+3]*xs;

			yshift[col0] += upd0;
			yshift[col1] += upd1;
			yshift[col2] += upd2;
			yshift[col3] += upd3;
		}

		/* Up to three leftover nonzeroes. */
		for(;nz<row_end;++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz]*xs;

			yshift[col] += upd;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Scatter form: rhs is re-based by (roff-coff) and out by (coff-roff);
	 * for each row i in [br,bc) the re-based rhs entry i is scaled by *alphap
	 * once, and VA[k] times that value is added to the re-based out entry
	 * bindx[k] for every k in [bpntr[i],bpntr[i+1]).
	 * No conjugation step appears in this body (the values are plain float).
	 * Column indices are widened to rsb_coo_idx_t before being used.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this
	 * body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	const float *xshift = rhs+(roff-coff);
	float *yshift = out+(coff-roff);
	rsb_coo_idx_t row = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xs = alpha*xshift[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		/* Bulk of the row, four nonzeroes per trip: products first, updates after. */
		for(nz=row_begin;nz+3<row_end;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float upd0 = VA[nz+0]*xs;
			const float upd1 = VA[nz+1]*xs;
			const float upd2 = VA[nz+2]*xs;
			const float upd3 = VA[nz+3]*xs;

			yshift[col0] += upd0;
			yshift[col1] += upd1;
			yshift[col2] += upd2;
			yshift[col3] += upd3;
		}

		/* Up to three leftover nonzeroes. */
		for(;nz<row_end;++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz]*xs;

			yshift[col] += upd;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every stored nonzero VA[k] of row i (column j=bindx[k]) is used twice:
	 * VA[k]*rhs[j] goes into the row accumulator, later scaled by *alphap and
	 * added to out[i]; and VA[k] times the *alphap-scaled, re-based rhs entry
	 * i is added to the re-based out entry j (rhs re-based by roff-coff, out
	 * by coff-roff). For the first and the last nonzero of a row the second
	 * update is skipped when roff==coff and j==i; interior nonzeroes are never
	 * checked (presumably a diagonal entry can only sit at a row end -- to be
	 * confirmed against the caller).
	 * Rows with no nonzeroes leave out[i] untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this
	 * body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	const float *xshift = rhs+(roff-coff);
	float *yshift = out+(coff-roff);
	rsb_coo_idx_t row = 0, col = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		float dot = ((float)(0));
		const float xs = alpha*xshift[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		if(row_begin==row_end)
			continue;	/* empty row */

		/* First nonzero of the row: mirrored update is conditional. */
		col = bindx[row_begin];
		dot += VA[row_begin]*rhs[col];
		if(!(roff==coff && col==row))
			yshift[col] += VA[row_begin]*xs;

		/* Interior nonzeroes, four per trip; the last one is left out on purpose. */
		for(nz=row_begin+1;nz+3<row_end-1;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float x0 = rhs[col0], v0 = VA[nz+0];
			const float x1 = rhs[col1], v1 = VA[nz+1];
			const float x2 = rhs[col2], v2 = VA[nz+2];
			const float x3 = rhs[col3], v3 = VA[nz+3];
			const float upd0 = v0*xs;
			const float upd1 = v1*xs;
			const float upd2 = v2*xs;
			const float upd3 = v3*xs;

			dot += v0*x0;
			yshift[col0] += upd0;
			dot += v1*x1;
			yshift[col1] += upd1;
			dot += v2*x2;
			yshift[col2] += upd2;
			dot += v3*x3;
			yshift[col3] += upd3;
		}

		/* Leftover interior nonzeroes. */
		for(;nz<row_end-1;++nz)
		{
			const rsb_coo_idx_t colr = bindx[nz];
			const float x = rhs[colr];
			const float v = VA[nz];
			const float upd = v*xs;

			dot += v*x;
			yshift[colr] += upd;
		}

		/* Last nonzero of the row (if distinct from the first): conditional again. */
		if(nz<row_end)
		{
			col = bindx[nz];
			dot += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yshift[col] += VA[nz]*xs;
		}

		out[row] += alpha*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every stored nonzero VA[k] of row i (column j=bindx[k]) is used twice:
	 * VA[k]*rhs[j] goes into the row accumulator, later scaled by *alphap and
	 * added to out[i]; and VA[k] times the *alphap-scaled, re-based rhs entry
	 * i is added to the re-based out entry j (rhs re-based by roff-coff, out
	 * by coff-roff). For the first and the last nonzero of a row the second
	 * update is skipped when roff==coff and j==i; interior nonzeroes are never
	 * checked (presumably a diagonal entry can only sit at a row end -- to be
	 * confirmed against the caller).
	 * Rows with no nonzeroes leave out[i] untouched.
	 * Column indices are widened to rsb_coo_idx_t before being used.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this
	 * body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	const float *xshift = rhs+(roff-coff);
	float *yshift = out+(coff-roff);
	rsb_coo_idx_t row = 0, col = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		float dot = ((float)(0));
		const float xs = alpha*xshift[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		if(row_begin==row_end)
			continue;	/* empty row */

		/* First nonzero of the row: mirrored update is conditional. */
		col = bindx[row_begin];
		dot += VA[row_begin]*rhs[col];
		if(!(roff==coff && col==row))
			yshift[col] += VA[row_begin]*xs;

		/* Interior nonzeroes, four per trip; the last one is left out on purpose. */
		for(nz=row_begin+1;nz+3<row_end-1;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float x0 = rhs[col0], v0 = VA[nz+0];
			const float x1 = rhs[col1], v1 = VA[nz+1];
			const float x2 = rhs[col2], v2 = VA[nz+2];
			const float x3 = rhs[col3], v3 = VA[nz+3];
			const float upd0 = v0*xs;
			const float upd1 = v1*xs;
			const float upd2 = v2*xs;
			const float upd3 = v3*xs;

			dot += v0*x0;
			yshift[col0] += upd0;
			dot += v1*x1;
			yshift[col1] += upd1;
			dot += v2*x2;
			yshift[col2] += upd2;
			dot += v3*x3;
			yshift[col3] += upd3;
		}

		/* Leftover interior nonzeroes. */
		for(;nz<row_end-1;++nz)
		{
			const rsb_coo_idx_t colr = bindx[nz];
			const float x = rhs[colr];
			const float v = VA[nz];
			const float upd = v*xs;

			dot += v*x;
			yshift[colr] += upd;
		}

		/* Last nonzero of the row (if distinct from the first): conditional again. */
		if(nz<row_end)
		{
			col = bindx[nz];
			dot += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yshift[col] += VA[nz]*xs;
		}

		out[row] += alpha*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Forwarding stub: symmetric transposed reverts to symmetric not
	 * transposed, so every argument is handed over unchanged to the matching
	 * tN kernel and its error code is returned.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Forwarding stub: symmetric transposed reverts to symmetric not
	 * transposed, so every argument is handed over unchanged to the matching
	 * tN kernel and its error code is returned.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Forwarding stub: symmetric transposed reverts to symmetric not
	 * transposed, so every argument is handed over unchanged to the matching
	 * tN kernel and its error code is returned.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Forwarding stub: symmetric transposed reverts to symmetric not
	 * transposed, so every argument is handed over unchanged to the matching
	 * tN kernel and its error code is returned.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every stored nonzero VA[k] of row i (column j=bindx[k]) is used twice:
	 * VA[k]*rhs[j] goes into the row accumulator, later scaled by *alphap and
	 * added to out[i]; and VA[k] times the *alphap-scaled, re-based rhs entry
	 * i is added to the re-based out entry j (rhs re-based by roff-coff, out
	 * by coff-roff). For the first and the last nonzero of a row the second
	 * update is skipped when roff==coff and j==i; interior nonzeroes are never
	 * checked (presumably a diagonal entry can only sit at a row end -- to be
	 * confirmed against the caller).
	 * No conjugation step appears in this body (the values are plain float).
	 * Rows with no nonzeroes leave out[i] untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this
	 * body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	const float *xshift = rhs+(roff-coff);
	float *yshift = out+(coff-roff);
	rsb_coo_idx_t row = 0, col = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		float dot = ((float)(0));
		const float xs = alpha*xshift[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		if(row_begin==row_end)
			continue;	/* empty row */

		/* First nonzero of the row: mirrored update is conditional. */
		col = bindx[row_begin];
		dot += VA[row_begin]*rhs[col];
		if(!(roff==coff && col==row))
			yshift[col] += VA[row_begin]*xs;

		/* Interior nonzeroes, four per trip; the last one is left out on purpose. */
		for(nz=row_begin+1;nz+3<row_end-1;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float x0 = rhs[col0], v0 = VA[nz+0];
			const float x1 = rhs[col1], v1 = VA[nz+1];
			const float x2 = rhs[col2], v2 = VA[nz+2];
			const float x3 = rhs[col3], v3 = VA[nz+3];
			const float upd0 = v0*xs;
			const float upd1 = v1*xs;
			const float upd2 = v2*xs;
			const float upd3 = v3*xs;

			dot += v0*x0;
			yshift[col0] += upd0;
			dot += v1*x1;
			yshift[col1] += upd1;
			dot += v2*x2;
			yshift[col2] += upd2;
			dot += v3*x3;
			yshift[col3] += upd3;
		}

		/* Leftover interior nonzeroes. */
		for(;nz<row_end-1;++nz)
		{
			const rsb_coo_idx_t colr = bindx[nz];
			const float x = rhs[colr];
			const float v = VA[nz];
			const float upd = v*xs;

			dot += v*x;
			yshift[colr] += upd;
		}

		/* Last nonzero of the row (if distinct from the first): conditional again. */
		if(nz<row_end)
		{
			col = bindx[nz];
			dot += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yshift[col] += VA[nz]*xs;
		}

		out[row] += alpha*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every stored nonzero VA[k] of row i (column j=bindx[k]) is used twice:
	 * VA[k]*rhs[j] goes into the row accumulator, later scaled by *alphap and
	 * added to out[i]; and VA[k] times the *alphap-scaled, re-based rhs entry
	 * i is added to the re-based out entry j (rhs re-based by roff-coff, out
	 * by coff-roff). For the first and the last nonzero of a row the second
	 * update is skipped when roff==coff and j==i; interior nonzeroes are never
	 * checked (presumably a diagonal entry can only sit at a row end -- to be
	 * confirmed against the caller).
	 * No conjugation step appears in this body (the values are plain float).
	 * Rows with no nonzeroes leave out[i] untouched.
	 * Column indices are widened to rsb_coo_idx_t before being used.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this
	 * body, which only ever returns RSB_ERR_NO_ERROR.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float alpha = *alphap;
	const float *xshift = rhs+(roff-coff);
	float *yshift = out+(coff-roff);
	rsb_coo_idx_t row = 0, col = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		float dot = ((float)(0));
		const float xs = alpha*xshift[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];

		if(row_begin==row_end)
			continue;	/* empty row */

		/* First nonzero of the row: mirrored update is conditional. */
		col = bindx[row_begin];
		dot += VA[row_begin]*rhs[col];
		if(!(roff==coff && col==row))
			yshift[col] += VA[row_begin]*xs;

		/* Interior nonzeroes, four per trip; the last one is left out on purpose. */
		for(nz=row_begin+1;nz+3<row_end-1;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float x0 = rhs[col0], v0 = VA[nz+0];
			const float x1 = rhs[col1], v1 = VA[nz+1];
			const float x2 = rhs[col2], v2 = VA[nz+2];
			const float x3 = rhs[col3], v3 = VA[nz+3];
			const float upd0 = v0*xs;
			const float upd1 = v1*xs;
			const float upd2 = v2*xs;
			const float upd3 = v3*xs;

			dot += v0*x0;
			yshift[col0] += upd0;
			dot += v1*x1;
			yshift[col1] += upd1;
			dot += v2*x2;
			yshift[col2] += upd2;
			dot += v3*x3;
			yshift[col3] += upd3;
		}

		/* Leftover interior nonzeroes. */
		for(;nz<row_end-1;++nz)
		{
			const rsb_coo_idx_t colr = bindx[nz];
			const float x = rhs[colr];
			const float v = VA[nz];
			const float upd = v*xs;

			dot += v*x;
			yshift[colr] += upd;
		}

		/* Last nonzero of the row (if distinct from the first): conditional again. */
		if(nz<row_end)
		{
			col = bindx[nz];
			dot += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				yshift[col] += VA[nz]*xs;
		}

		out[row] += alpha*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Forwarding stub: symmetric transposed reverts to symmetric not
	 * transposed, so every argument is handed over unchanged to the matching
	 * tN kernel and its error code is returned.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Forwarding stub: symmetric transposed reverts to symmetric not
	 * transposed, so every argument is handed over unchanged to the matching
	 * tN kernel and its error code is returned.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_C__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Forwarding stub: symmetric transposed reverts to symmetric not
	 * transposed, so every argument is handed over unchanged to the matching
	 * tN kernel and its error code is returned.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_H__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Forwarding stub: symmetric transposed reverts to symmetric not
	 * transposed, so every argument is handed over unchanged to the matching
	 * tN kernel and its error code is returned.
	 * NOTE: Diagonal implicit is not really handled here: look at caller level.
	 * \return \rsb_errval_inp_param_msg
	 */
	const rsb_err_t errval = rsb__BCSR_spmv_uxua_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,alphap);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc) the products VA[k]*rhs[bindx[k]], with k
	 * running over [bpntr[i],bpntr[i+1]), are summed in storage order into a
	 * single accumulator; the sum times -1 is then added to out[i].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not
	 * referenced by this body, which only ever returns RSB_ERR_NO_ERROR.
	 * \return \rsb_errval_inp_param_msg
	 */
	rsb_coo_idx_t row = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		float dot = ((float)(0));

		/* Bulk of the row, four nonzeroes per trip. */
		for(nz=row_begin;nz+3<row_end;nz+=4)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float x0 = rhs[col0], v0 = VA[nz+0];
			const float x1 = rhs[col1], v1 = VA[nz+1];
			const float x2 = rhs[col2], v2 = VA[nz+2];
			const float x3 = rhs[col3], v3 = VA[nz+3];

			dot += v0*x0;
			dot += v1*x1;
			dot += v2*x2;
			dot += v3*x3;
		}

		/* Up to three leftover nonzeroes. */
		for(;nz<row_end;++nz)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float x = rhs[col];
			const float v = VA[nz];

			dot += v*x;
		}

		out[row] += (-1)*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc), the entries VA[k] with bpntr[i] <= k < bpntr[i+1] are
	 * multiplied by rhs[bindx[k]] and accumulated in k order; the sum is then subtracted from out[i].
	 * Column indices are read as rsb_half_idx_t and converted to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		float acc=((float)(0));

		/* unrolled part: four stored entries per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float x0=rhs[col0];
			const float v0=VA[nz+0];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float x1=rhs[col1];
			const float v1=VA[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float x2=rhs[col2];
			const float v2=VA[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float x3=rhs[col3];
			const float v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* leftover entries (at most three) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col=bindx[nz];
			const float x=rhs[col];
			const float v=VA[nz];

			acc+=v*x;
			++nz;
		}

		out[row]+=(-1)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The input vector is addressed through rhs+(roff-coff) and the output vector through out+(coff-roff).
	 * For each row i in [br,bc), the shifted input element i is negated and multiplied with each
	 * VA[k], bpntr[i] <= k < bpntr[i+1]; each product is added to shifted output element bindx[k].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four stored entries per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float v0=VA[nz+0];
			const float upd0=v0*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float v1=VA[nz+1];
			const float upd1=v1*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float v2=VA[nz+2];
			const float upd2=v2*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float v3=VA[nz+3];
			const float upd3=v3*neg_x;

			shifted_out[col0]+=upd0;
			shifted_out[col1]+=upd1;
			shifted_out[col2]+=upd2;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* leftover entries (at most three) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col=bindx[nz];
			const float v=VA[nz];
			const float upd=v*neg_x;

			shifted_out[col]+=upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The input vector is addressed through rhs+(roff-coff) and the output vector through out+(coff-roff).
	 * For each row i in [br,bc), the shifted input element i is negated and multiplied with each
	 * VA[k], bpntr[i] <= k < bpntr[i+1]; each product is added to shifted output element bindx[k].
	 * Column indices are read as rsb_half_idx_t and converted to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four stored entries per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float v0=VA[nz+0];
			const float upd0=v0*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float v1=VA[nz+1];
			const float upd1=v1*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float v2=VA[nz+2];
			const float upd2=v2*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float v3=VA[nz+3];
			const float upd3=v3*neg_x;

			shifted_out[col0]+=upd0;
			shifted_out[col1]+=upd1;
			shifted_out[col2]+=upd2;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* leftover entries (at most three) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col=bindx[nz];
			const float v=VA[nz];
			const float upd=v*neg_x;

			shifted_out[col]+=upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The values are plain floats and the code applies no conjugation to them.
	 * The input vector is addressed through rhs+(roff-coff) and the output vector through out+(coff-roff).
	 * For each row i in [br,bc), the shifted input element i is negated and multiplied with each
	 * VA[k], bpntr[i] <= k < bpntr[i+1]; each product is added to shifted output element bindx[k].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four stored entries per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float v0=VA[nz+0];
			const float upd0=v0*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float v1=VA[nz+1];
			const float upd1=v1*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float v2=VA[nz+2];
			const float upd2=v2*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float v3=VA[nz+3];
			const float upd3=v3*neg_x;

			shifted_out[col0]+=upd0;
			shifted_out[col1]+=upd1;
			shifted_out[col2]+=upd2;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* leftover entries (at most three) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col=bindx[nz];
			const float v=VA[nz];
			const float upd=v*neg_x;

			shifted_out[col]+=upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The values are plain floats and the code applies no conjugation to them.
	 * The input vector is addressed through rhs+(roff-coff) and the output vector through out+(coff-roff).
	 * For each row i in [br,bc), the shifted input element i is negated and multiplied with each
	 * VA[k], bpntr[i] <= k < bpntr[i+1]; each product is added to shifted output element bindx[k].
	 * Column indices are read as rsb_half_idx_t and converted to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four stored entries per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float v0=VA[nz+0];
			const float upd0=v0*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float v1=VA[nz+1];
			const float upd1=v1*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float v2=VA[nz+2];
			const float upd2=v2*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float v3=VA[nz+3];
			const float upd3=v3*neg_x;

			shifted_out[col0]+=upd0;
			shifted_out[col1]+=upd1;
			shifted_out[col2]+=upd2;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* leftover entries (at most three) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col=bindx[nz];
			const float v=VA[nz];
			const float upd=v*neg_x;

			shifted_out[col]+=upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Each stored entry VA[k] of row i (bpntr[i] <= k < bpntr[i+1], i in [br,bc)) is used twice:
	 * VA[k]*rhs[bindx[k]] goes into a per-row sum that is finally subtracted from out[i], and
	 * VA[k] times the negated element i of rhs+(roff-coff) is added to element bindx[k] of out+(coff-roff).
	 * Only the first and the last stored entry of a row are tested for roff==coff with column==row;
	 * when that holds the second (mirrored) update is skipped for that entry.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		float acc=((float)(0));
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_begin=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* nothing stored for this row */

		/* first stored entry: may be the diagonal one */
		col=bindx[nz];
		acc += VA[nz]*rhs[col];
		if(roff!=coff || (col!=row))
			shifted_out[col]+=VA[nz]*neg_x;
		++nz;

		/* interior entries, four per iteration; the last entry of the row is left out */
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float x0=rhs[col0];
			const float v0=VA[nz+0];
			const float upd0=(v0)*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float x1=rhs[col1];
			const float v1=VA[nz+1];
			const float upd1=(v1)*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float x2=rhs[col2];
			const float v2=VA[nz+2];
			const float upd2=(v2)*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float x3=rhs[col3];
			const float v3=VA[nz+3];
			const float upd3=(v3)*neg_x;

			acc += (v0)*x0;
			shifted_out[col0]+=upd0;
			acc += (v1)*x1;
			shifted_out[col1]+=upd1;
			acc += (v2)*x2;
			shifted_out[col2]+=upd2;
			acc += (v3)*x3;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* remaining interior entries, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float x0=rhs[col0];
			const float v0=VA[nz];
			const float upd0=(v0)*neg_x;

			acc += (v0)*x0;
			shifted_out[col0]+=upd0;
			++nz;
		}

		/* last stored entry (absent when the row holds a single entry): may be the diagonal one */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[col];
			if(roff!=coff || (col!=row))
				shifted_out[col]+=VA[nz]*neg_x;
		}

		out[row]+=(-1)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Each stored entry VA[k] of row i (bpntr[i] <= k < bpntr[i+1], i in [br,bc)) is used twice:
	 * VA[k]*rhs[bindx[k]] goes into a per-row sum that is finally subtracted from out[i], and
	 * VA[k] times the negated element i of rhs+(roff-coff) is added to element bindx[k] of out+(coff-roff).
	 * Only the first and the last stored entry of a row are tested for roff==coff with column==row;
	 * when that holds the second (mirrored) update is skipped for that entry.
	 * Column indices are read as rsb_half_idx_t and converted to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		float acc=((float)(0));
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_begin=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* nothing stored for this row */

		/* first stored entry: may be the diagonal one */
		col=bindx[nz];
		acc += VA[nz]*rhs[col];
		if(roff!=coff || (col!=row))
			shifted_out[col]+=VA[nz]*neg_x;
		++nz;

		/* interior entries, four per iteration; the last entry of the row is left out */
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float x0=rhs[col0];
			const float v0=VA[nz+0];
			const float upd0=(v0)*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float x1=rhs[col1];
			const float v1=VA[nz+1];
			const float upd1=(v1)*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float x2=rhs[col2];
			const float v2=VA[nz+2];
			const float upd2=(v2)*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float x3=rhs[col3];
			const float v3=VA[nz+3];
			const float upd3=(v3)*neg_x;

			acc += (v0)*x0;
			shifted_out[col0]+=upd0;
			acc += (v1)*x1;
			shifted_out[col1]+=upd1;
			acc += (v2)*x2;
			shifted_out[col2]+=upd2;
			acc += (v3)*x3;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* remaining interior entries, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float x0=rhs[col0];
			const float v0=VA[nz];
			const float upd0=(v0)*neg_x;

			acc += (v0)*x0;
			shifted_out[col0]+=upd0;
			++nz;
		}

		/* last stored entry (absent when the row holds a single entry): may be the diagonal one */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[col];
			if(roff!=coff || (col!=row))
				shifted_out[col]+=VA[nz]*neg_x;
		}

		out[row]+=(-1)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * This variant contains no computation of its own: every argument is passed on,
	 * unmodified and in the same order, to the untransposed (tN) kernel of the same family.
	 * \return whatever the untransposed kernel returns.
	 */

	/* With a symmetric operand the transposed product falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * This variant contains no computation of its own: every argument is passed on,
	 * unmodified and in the same order, to the untransposed (tN) kernel of the same family.
	 * \return whatever the untransposed kernel returns.
	 */

	/* With a symmetric operand the transposed product falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * This variant contains no computation of its own: every argument is passed on,
	 * unmodified and in the same order, to the untransposed (tN) kernel of the same family.
	 * \return whatever the untransposed kernel returns.
	 */

	/* With a symmetric operand the transposed product falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * This variant contains no computation of its own: every argument is passed on,
	 * unmodified and in the same order, to the untransposed (tN) kernel of the same family.
	 * \return whatever the untransposed kernel returns.
	 */

	/* With a symmetric operand the transposed product falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sS_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The values are plain floats and the code applies no conjugation to them.
	 * Each stored entry VA[k] of row i (bpntr[i] <= k < bpntr[i+1], i in [br,bc)) is used twice:
	 * VA[k]*rhs[bindx[k]] goes into a per-row sum that is finally subtracted from out[i], and
	 * VA[k] times the negated element i of rhs+(roff-coff) is added to element bindx[k] of out+(coff-roff).
	 * Only the first and the last stored entry of a row are tested for roff==coff with column==row;
	 * when that holds the second (mirrored) update is skipped for that entry.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		float acc=((float)(0));
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_begin=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* nothing stored for this row */

		/* first stored entry: may be the diagonal one */
		col=bindx[nz];
		acc += VA[nz]*rhs[col];
		if(roff!=coff || (col!=row))
			shifted_out[col]+=VA[nz]*neg_x;
		++nz;

		/* interior entries, four per iteration; the last entry of the row is left out */
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float x0=rhs[col0];
			const float v0=VA[nz+0];
			const float upd0=(v0)*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float x1=rhs[col1];
			const float v1=VA[nz+1];
			const float upd1=(v1)*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float x2=rhs[col2];
			const float v2=VA[nz+2];
			const float upd2=(v2)*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float x3=rhs[col3];
			const float v3=VA[nz+3];
			const float upd3=(v3)*neg_x;

			acc += (v0)*x0;
			shifted_out[col0]+=upd0;
			acc += (v1)*x1;
			shifted_out[col1]+=upd1;
			acc += (v2)*x2;
			shifted_out[col2]+=upd2;
			acc += (v3)*x3;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* remaining interior entries, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float x0=rhs[col0];
			const float v0=VA[nz];
			const float upd0=(v0)*neg_x;

			acc += (v0)*x0;
			shifted_out[col0]+=upd0;
			++nz;
		}

		/* last stored entry (absent when the row holds a single entry): may be the diagonal one */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[col];
			if(roff!=coff || (col!=row))
				shifted_out[col]+=VA[nz]*neg_x;
		}

		out[row]+=(-1)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The values are plain floats and the code applies no conjugation to them.
	 * Each stored entry VA[k] of row i (bpntr[i] <= k < bpntr[i+1], i in [br,bc)) is used twice:
	 * VA[k]*rhs[bindx[k]] goes into a per-row sum that is finally subtracted from out[i], and
	 * VA[k] times the negated element i of rhs+(roff-coff) is added to element bindx[k] of out+(coff-roff).
	 * Only the first and the last stored entry of a row are tested for roff==coff with column==row;
	 * when that holds the second (mirrored) update is skipped for that entry.
	 * Column indices are read as rsb_half_idx_t and converted to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		float acc=((float)(0));
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_begin=bpntr[row],row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		if(row_begin==row_end)
			continue;	/* nothing stored for this row */

		/* first stored entry: may be the diagonal one */
		col=bindx[nz];
		acc += VA[nz]*rhs[col];
		if(roff!=coff || (col!=row))
			shifted_out[col]+=VA[nz]*neg_x;
		++nz;

		/* interior entries, four per iteration; the last entry of the row is left out */
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float x0=rhs[col0];
			const float v0=VA[nz+0];
			const float upd0=(v0)*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float x1=rhs[col1];
			const float v1=VA[nz+1];
			const float upd1=(v1)*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float x2=rhs[col2];
			const float v2=VA[nz+2];
			const float upd2=(v2)*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float x3=rhs[col3];
			const float v3=VA[nz+3];
			const float upd3=(v3)*neg_x;

			acc += (v0)*x0;
			shifted_out[col0]+=upd0;
			acc += (v1)*x1;
			shifted_out[col1]+=upd1;
			acc += (v2)*x2;
			shifted_out[col2]+=upd2;
			acc += (v3)*x3;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* remaining interior entries, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float x0=rhs[col0];
			const float v0=VA[nz];
			const float upd0=(v0)*neg_x;

			acc += (v0)*x0;
			shifted_out[col0]+=upd0;
			++nz;
		}

		/* last stored entry (absent when the row holds a single entry): may be the diagonal one */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[col];
			if(roff!=coff || (col!=row))
				shifted_out[col]+=VA[nz]*neg_x;
		}

		out[row]+=(-1)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * This variant contains no computation of its own: every argument is passed on,
	 * unmodified and in the same order, to the untransposed (tN) kernel of the same family.
	 * \return whatever the untransposed kernel returns.
	 */

	/* With a symmetric operand the transposed product falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * This variant contains no computation of its own: every argument is passed on,
	 * unmodified and in the same order, to the untransposed (tN) kernel of the same family.
	 * \return whatever the untransposed kernel returns.
	 */

	/* With a symmetric operand the transposed product falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * This variant contains no computation of its own: every argument is passed on,
	 * unmodified and in the same order, to the untransposed (tN) kernel of the same family.
	 * \return whatever the untransposed kernel returns.
	 */

	/* With a symmetric operand the transposed product falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^H} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * This variant contains no computation of its own: every argument is passed on,
	 * unmodified and in the same order, to the untransposed (tN) kernel of the same family.
	 * \return whatever the untransposed kernel returns.
	 */

	/* With a symmetric operand the transposed product falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sH_dE_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc), the entries VA[k] with bpntr[i] <= k < bpntr[i+1] are
	 * multiplied by rhs[bindx[k]] and accumulated in k order; the sum is then subtracted from out[i].
	 * Only stored entries are processed: nothing is added here for an implicit diagonal.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: the implicit diagonal is not dealt with at this level; see the caller. */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		float acc=((float)(0));

		/* unrolled part: four stored entries per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float x0=rhs[col0];
			const float v0=VA[nz+0];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float x1=rhs[col1];
			const float v1=VA[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float x2=rhs[col2];
			const float v2=VA[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float x3=rhs[col3];
			const float v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* leftover entries (at most three) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col=bindx[nz];
			const float x=rhs[col];
			const float v=VA[nz];

			acc+=v*x;
			++nz;
		}

		out[row]+=(-1)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc), the entries VA[k] with bpntr[i] <= k < bpntr[i+1] are
	 * multiplied by rhs[bindx[k]] and accumulated in k order; the sum is then subtracted from out[i].
	 * Only stored entries are processed: nothing is added here for an implicit diagonal.
	 * Column indices are read as rsb_half_idx_t and converted to rsb_coo_idx_t before use.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: the implicit diagonal is not dealt with at this level; see the caller. */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];
		float acc=((float)(0));

		/* unrolled part: four stored entries per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float x0=rhs[col0];
			const float v0=VA[nz+0];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float x1=rhs[col1];
			const float v1=VA[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float x2=rhs[col2];
			const float v2=VA[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float x3=rhs[col3];
			const float v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* leftover entries (at most three) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col=bindx[nz];
			const float x=rhs[col];
			const float v=VA[nz];

			acc+=v*x;
			++nz;
		}

		out[row]+=(-1)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y - {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The input vector is addressed through rhs+(roff-coff) and the output vector through out+(coff-roff).
	 * For each row i in [br,bc), the shifted input element i is negated and multiplied with each
	 * VA[k], bpntr[i] <= k < bpntr[i+1]; each product is added to shifted output element bindx[k].
	 * Only stored entries are processed: nothing is added here for an implicit diagonal.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: the implicit diagonal is not dealt with at this level; see the caller. */
	const float *shifted_rhs = rhs+1*(roff-coff);
	float *shifted_out=out+1*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box only */
	{
		const float neg_x=(-1)*shifted_rhs[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four stored entries per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz+0];
			const float v0=VA[nz+0];
			const float upd0=v0*neg_x;
			const rsb_coo_idx_t col1=bindx[nz+1];
			const float v1=VA[nz+1];
			const float upd1=v1*neg_x;
			const rsb_coo_idx_t col2=bindx[nz+2];
			const float v2=VA[nz+2];
			const float upd2=v2*neg_x;
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float v3=VA[nz+3];
			const float upd3=v3*neg_x;

			shifted_out[col0]+=upd0;
			shifted_out[col1]+=upd1;
			shifted_out[col2]+=upd2;
			shifted_out[col3]+=upd3;
			nz+=4;
		}

		/* leftover entries (at most three) */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col=bindx[nz];
			const float v=VA[nz];
			const float upd=v*neg_x;

			shifted_out[col]+=upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed update \f$y \leftarrow y - {A^T} \cdot x\f$, for a general matrix (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, every nonzero VA[k] with
	 * bpntr[i] <= k < bpntr[i+1] adds VA[k] * (-rhs[i+roff-coff]) to out[bindx[k]+coff-roff].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_sh = rhs + (roff - coff);	/* operand, shifted by the offsets difference */
	float *const y_sh = out + (coff - roff);	/* result, shifted the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = (-1) * x_sh[row];	/* the -1 scaling is folded into the operand */
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: indices (widened to rsb_coo_idx_t) and products first, then the updates in order */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float upd0 = VA[nz] * xs;
			const float upd1 = VA[nz + 1] * xs;
			const float upd2 = VA[nz + 2] * xs;
			const float upd3 = VA[nz + 3] * xs;

			y_sh[col0] += upd0;
			y_sh[col1] += upd1;
			y_sh[col2] += upd2;
			y_sh[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz] * xs;

			y_sh[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate transposed update \f$y \leftarrow y - {A^H} \cdot x\f$, for a general matrix (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The values are real (float), so no conjugation is performed in this body.
	 * Rows br (included) to bc (excluded) are visited. For row i, every nonzero VA[k] with
	 * bpntr[i] <= k < bpntr[i+1] adds VA[k] * (-rhs[i+roff-coff]) to out[bindx[k]+coff-roff].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_sh = rhs + (roff - coff);	/* operand, shifted by the offsets difference */
	float *const y_sh = out + (coff - roff);	/* result, shifted the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = (-1) * x_sh[row];	/* the -1 scaling is folded into the operand */
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: indices and products first, then the updates in order */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float upd0 = VA[nz] * xs;
			const float upd1 = VA[nz + 1] * xs;
			const float upd2 = VA[nz + 2] * xs;
			const float upd3 = VA[nz + 3] * xs;

			y_sh[col0] += upd0;
			y_sh[col1] += upd1;
			y_sh[col2] += upd2;
			y_sh[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz] * xs;

			y_sh[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate transposed update \f$y \leftarrow y - {A^H} \cdot x\f$, for a general matrix (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The values are real (float), so no conjugation is performed in this body.
	 * Rows br (included) to bc (excluded) are visited. For row i, every nonzero VA[k] with
	 * bpntr[i] <= k < bpntr[i+1] adds VA[k] * (-rhs[i+roff-coff]) to out[bindx[k]+coff-roff].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_sh = rhs + (roff - coff);	/* operand, shifted by the offsets difference */
	float *const y_sh = out + (coff - roff);	/* result, shifted the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = (-1) * x_sh[row];	/* the -1 scaling is folded into the operand */
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: indices (widened to rsb_coo_idx_t) and products first, then the updates in order */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float upd0 = VA[nz] * xs;
			const float upd1 = VA[nz + 1] * xs;
			const float upd2 = VA[nz + 2] * xs;
			const float upd3 = VA[nz + 3] * xs;

			y_sh[col0] += upd0;
			y_sh[col1] += upd1;
			y_sh[col2] += upd2;
			y_sh[col3] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz] * xs;

			y_sh[col] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Symmetric update \f$y \leftarrow y - {A} \cdot x\f$, with \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. Each stored nonzero VA[k] of row i, at column j = bindx[k],
	 * is used twice: VA[k]*rhs[j] goes into the row sum subtracted from out[i], and
	 * VA[k] * (-rhs[i+roff-coff]) is added to the mirrored position out[j+coff-roff].
	 * For the first and the last nonzero of a row the mirrored update is skipped when roff == coff and j == i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_sh = rhs + (roff - coff);	/* operand as seen by the mirrored part */
	float *const y_sh = out + (coff - roff);	/* result as seen by the mirrored part */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = (-1) * x_sh[row];	/* the -1 scaling is folded into the mirrored operand */
		const rsb_nnz_idx_t row_begin = bpntr[row], row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		float dot = ((float)(0));	/* running sum of VA[k]*rhs[j] over this row */

		if (row_begin == row_end)
			continue;	/* empty row: out is left untouched */

		/* first nonzero: the mirrored update is guarded, so that a diagonal entry is not applied twice */
		{
			const rsb_coo_idx_t col = bindx[nz];

			dot += VA[nz] * rhs[col];
			if (roff != coff || col != row)
				y_sh[col] += VA[nz] * xs;
		}
		nz = row_begin + 1;

		/*
		 * Interior nonzeroes (all but the first and the last), four per pass, mirrored unconditionally.
		 * NOTE(review): presumably a diagonal entry can only be first or last in its row; confirm against the storage convention.
		 */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float v0 = VA[nz];
			const float m0 = v0 * xs;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float x1 = rhs[col1];
			const float v1 = VA[nz + 1];
			const float m1 = v1 * xs;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float x2 = rhs[col2];
			const float v2 = VA[nz + 2];
			const float m2 = v2 * xs;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x3 = rhs[col3];
			const float v3 = VA[nz + 3];
			const float m3 = v3 * xs;

			dot += v0 * x0;
			y_sh[col0] += m0;
			dot += v1 * x1;
			y_sh[col1] += m1;
			dot += v2 * x2;
			y_sh[col2] += m2;
			dot += v3 * x3;
			y_sh[col3] += m3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float xv = rhs[col];
			const float v = VA[nz];
			const float m = v * xs;

			dot += v * xv;
			y_sh[col] += m;
			++nz;
		}

		/* last nonzero (present only if the row holds more than one): guarded like the first */
		if (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];

			dot += VA[nz] * rhs[col];
			if (roff != coff || col != row)
				y_sh[col] += VA[nz] * xs;
		}

		out[row] += (-1) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Symmetric update \f$y \leftarrow y - {A} \cdot x\f$, with \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. Each stored nonzero VA[k] of row i, at column j = bindx[k],
	 * is used twice: VA[k]*rhs[j] goes into the row sum subtracted from out[i], and
	 * VA[k] * (-rhs[i+roff-coff]) is added to the mirrored position out[j+coff-roff].
	 * For the first and the last nonzero of a row the mirrored update is skipped when roff == coff and j == i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_sh = rhs + (roff - coff);	/* operand as seen by the mirrored part */
	float *const y_sh = out + (coff - roff);	/* result as seen by the mirrored part */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = (-1) * x_sh[row];	/* the -1 scaling is folded into the mirrored operand */
		const rsb_nnz_idx_t row_begin = bpntr[row], row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		float dot = ((float)(0));	/* running sum of VA[k]*rhs[j] over this row */

		if (row_begin == row_end)
			continue;	/* empty row: out is left untouched */

		/* first nonzero: the mirrored update is guarded, so that a diagonal entry is not applied twice */
		{
			const rsb_coo_idx_t col = bindx[nz];	/* half index widened to rsb_coo_idx_t */

			dot += VA[nz] * rhs[col];
			if (roff != coff || col != row)
				y_sh[col] += VA[nz] * xs;
		}
		nz = row_begin + 1;

		/*
		 * Interior nonzeroes (all but the first and the last), four per pass, mirrored unconditionally.
		 * NOTE(review): presumably a diagonal entry can only be first or last in its row; confirm against the storage convention.
		 */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float v0 = VA[nz];
			const float m0 = v0 * xs;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float x1 = rhs[col1];
			const float v1 = VA[nz + 1];
			const float m1 = v1 * xs;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float x2 = rhs[col2];
			const float v2 = VA[nz + 2];
			const float m2 = v2 * xs;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x3 = rhs[col3];
			const float v3 = VA[nz + 3];
			const float m3 = v3 * xs;

			dot += v0 * x0;
			y_sh[col0] += m0;
			dot += v1 * x1;
			y_sh[col1] += m1;
			dot += v2 * x2;
			y_sh[col2] += m2;
			dot += v3 * x3;
			y_sh[col3] += m3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float xv = rhs[col];
			const float v = VA[nz];
			const float m = v * xs;

			dot += v * xv;
			y_sh[col] += m;
			++nz;
		}

		/* last nonzero (present only if the row holds more than one): guarded like the first */
		if (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];

			dot += VA[nz] * rhs[col];
			if (roff != coff || col != row)
				y_sh[col] += VA[nz] * xs;
		}

		out[row] += (-1) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed symmetric update \f$y \leftarrow y - {A^T} \cdot x\f$, with \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No work is done here: every argument is handed over unchanged to the untransposed symmetric kernel.
	 * \return \rsb_errval_inp_param_msg (the value returned by the untransposed kernel)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed symmetric update \f$y \leftarrow y - {A^T} \cdot x\f$, with \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No work is done here: every argument is handed over unchanged to the untransposed symmetric kernel.
	 * \return \rsb_errval_inp_param_msg (the value returned by the untransposed kernel)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate transposed symmetric update \f$y \leftarrow y - {A^H} \cdot x\f$, with \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No work is done here: every argument is handed over unchanged to the untransposed symmetric kernel
	 * (the values are real floats, so there is nothing to conjugate).
	 * \return \rsb_errval_inp_param_msg (the value returned by the untransposed kernel)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate transposed symmetric update \f$y \leftarrow y - {A^H} \cdot x\f$, with \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No work is done here: every argument is handed over unchanged to the untransposed symmetric kernel
	 * (the values are real floats, so there is nothing to conjugate).
	 * \return \rsb_errval_inp_param_msg (the value returned by the untransposed kernel)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Hermitian update \f$y \leftarrow y - {A} \cdot x\f$, with \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The values are real (float), so the mirrored entries are used without any conjugation.
	 * Rows br (included) to bc (excluded) are visited. Each stored nonzero VA[k] of row i, at column j = bindx[k],
	 * is used twice: VA[k]*rhs[j] goes into the row sum subtracted from out[i], and
	 * VA[k] * (-rhs[i+roff-coff]) is added to the mirrored position out[j+coff-roff].
	 * For the first and the last nonzero of a row the mirrored update is skipped when roff == coff and j == i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_sh = rhs + (roff - coff);	/* operand as seen by the mirrored part */
	float *const y_sh = out + (coff - roff);	/* result as seen by the mirrored part */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = (-1) * x_sh[row];	/* the -1 scaling is folded into the mirrored operand */
		const rsb_nnz_idx_t row_begin = bpntr[row], row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		float dot = ((float)(0));	/* running sum of VA[k]*rhs[j] over this row */

		if (row_begin == row_end)
			continue;	/* empty row: out is left untouched */

		/* first nonzero: the mirrored update is guarded, so that a diagonal entry is not applied twice */
		{
			const rsb_coo_idx_t col = bindx[nz];

			dot += VA[nz] * rhs[col];
			if (roff != coff || col != row)
				y_sh[col] += VA[nz] * xs;
		}
		nz = row_begin + 1;

		/*
		 * Interior nonzeroes (all but the first and the last), four per pass, mirrored unconditionally.
		 * NOTE(review): presumably a diagonal entry can only be first or last in its row; confirm against the storage convention.
		 */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float v0 = VA[nz];
			const float m0 = v0 * xs;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float x1 = rhs[col1];
			const float v1 = VA[nz + 1];
			const float m1 = v1 * xs;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float x2 = rhs[col2];
			const float v2 = VA[nz + 2];
			const float m2 = v2 * xs;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x3 = rhs[col3];
			const float v3 = VA[nz + 3];
			const float m3 = v3 * xs;

			dot += v0 * x0;
			y_sh[col0] += m0;
			dot += v1 * x1;
			y_sh[col1] += m1;
			dot += v2 * x2;
			y_sh[col2] += m2;
			dot += v3 * x3;
			y_sh[col3] += m3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float xv = rhs[col];
			const float v = VA[nz];
			const float m = v * xs;

			dot += v * xv;
			y_sh[col] += m;
			++nz;
		}

		/* last nonzero (present only if the row holds more than one): guarded like the first */
		if (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];

			dot += VA[nz] * rhs[col];
			if (roff != coff || col != row)
				y_sh[col] += VA[nz] * xs;
		}

		out[row] += (-1) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Hermitian update \f$y \leftarrow y - {A} \cdot x\f$, with \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The values are real (float), so the mirrored entries are used without any conjugation.
	 * Rows br (included) to bc (excluded) are visited. Each stored nonzero VA[k] of row i, at column j = bindx[k],
	 * is used twice: VA[k]*rhs[j] goes into the row sum subtracted from out[i], and
	 * VA[k] * (-rhs[i+roff-coff]) is added to the mirrored position out[j+coff-roff].
	 * For the first and the last nonzero of a row the mirrored update is skipped when roff == coff and j == i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_sh = rhs + (roff - coff);	/* operand as seen by the mirrored part */
	float *const y_sh = out + (coff - roff);	/* result as seen by the mirrored part */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = (-1) * x_sh[row];	/* the -1 scaling is folded into the mirrored operand */
		const rsb_nnz_idx_t row_begin = bpntr[row], row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		float dot = ((float)(0));	/* running sum of VA[k]*rhs[j] over this row */

		if (row_begin == row_end)
			continue;	/* empty row: out is left untouched */

		/* first nonzero: the mirrored update is guarded, so that a diagonal entry is not applied twice */
		{
			const rsb_coo_idx_t col = bindx[nz];	/* half index widened to rsb_coo_idx_t */

			dot += VA[nz] * rhs[col];
			if (roff != coff || col != row)
				y_sh[col] += VA[nz] * xs;
		}
		nz = row_begin + 1;

		/*
		 * Interior nonzeroes (all but the first and the last), four per pass, mirrored unconditionally.
		 * NOTE(review): presumably a diagonal entry can only be first or last in its row; confirm against the storage convention.
		 */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0];
			const float v0 = VA[nz];
			const float m0 = v0 * xs;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float x1 = rhs[col1];
			const float v1 = VA[nz + 1];
			const float m1 = v1 * xs;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float x2 = rhs[col2];
			const float v2 = VA[nz + 2];
			const float m2 = v2 * xs;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x3 = rhs[col3];
			const float v3 = VA[nz + 3];
			const float m3 = v3 * xs;

			dot += v0 * x0;
			y_sh[col0] += m0;
			dot += v1 * x1;
			y_sh[col1] += m1;
			dot += v2 * x2;
			y_sh[col2] += m2;
			dot += v3 * x3;
			y_sh[col3] += m3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float xv = rhs[col];
			const float v = VA[nz];
			const float m = v * xs;

			dot += v * xv;
			y_sh[col] += m;
			++nz;
		}

		/* last nonzero (present only if the row holds more than one): guarded like the first */
		if (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];

			dot += VA[nz] * rhs[col];
			if (roff != coff || col != row)
				y_sh[col] += VA[nz] * xs;
		}

		out[row] += (-1) * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed hermitian update \f$y \leftarrow y - {A^T} \cdot x\f$, with \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No work is done here: every argument is handed over unchanged to the untransposed hermitian kernel
	 * (the values are real floats, so there is nothing to conjugate).
	 * \return \rsb_errval_inp_param_msg (the value returned by the untransposed kernel)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed hermitian update \f$y \leftarrow y - {A^T} \cdot x\f$, with \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No work is done here: every argument is handed over unchanged to the untransposed hermitian kernel
	 * (the values are real floats, so there is nothing to conjugate).
	 * \return \rsb_errval_inp_param_msg (the value returned by the untransposed kernel)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_C__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate transposed hermitian update \f$y \leftarrow y - {A^H} \cdot x\f$, with \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No work is done here: every argument is handed over unchanged to the untransposed hermitian kernel.
	 * \return \rsb_errval_inp_param_msg (the value returned by the untransposed kernel)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_unua_float_H__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate transposed hermitian update \f$y \leftarrow y - {A^H} \cdot x\f$, with \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No work is done here: every argument is handed over unchanged to the untransposed hermitian kernel.
	 * \return \rsb_errval_inp_param_msg (the value returned by the untransposed kernel)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_unua_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided update \f$y \leftarrow y + {A} \cdot x\f$, for a general matrix (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, the products VA[k]*rhs[bindx[k]*incx]
	 * over bpntr[i] <= k < bpntr[i+1] are summed in storage order, and the sum is added to out[i*incy].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		float dot = ((float)(0));	/* row sum, accumulated in storage order */

		/* four nonzeroes per pass: gather the operands first, then accumulate in order */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float x1 = rhs[col1 * incx];
			const float v1 = VA[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float x2 = rhs[col2 * incx];
			const float v2 = VA[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x3 = rhs[col3 * incx];
			const float v3 = VA[nz + 3];

			dot += v0 * x0;
			dot += v1 * x1;
			dot += v2 * x2;
			dot += v3 * x3;
			nz += 4;
		}

		/* leftover nonzeroes of this row, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float xv = rhs[col * incx];
			const float v = VA[nz];

			dot += v * xv;
			++nz;
		}

		out[row * incy] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided update \f$y \leftarrow y + {A} \cdot x\f$, for a general matrix (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, the products VA[k]*rhs[bindx[k]*incx]
	 * over bpntr[i] <= k < bpntr[i+1] are summed in storage order, and the sum is added to out[i*incy].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		float dot = ((float)(0));	/* row sum, accumulated in storage order */

		/* four nonzeroes per pass: gather the operands (indices widened to rsb_coo_idx_t) first, then accumulate in order */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float x1 = rhs[col1 * incx];
			const float v1 = VA[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float x2 = rhs[col2 * incx];
			const float v2 = VA[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x3 = rhs[col3 * incx];
			const float v3 = VA[nz + 3];

			dot += v0 * x0;
			dot += v1 * x1;
			dot += v2 * x2;
			dot += v3 * x3;
			nz += 4;
		}

		/* leftover nonzeroes of this row, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float xv = rhs[col * incx];
			const float v = VA[nz];

			dot += v * xv;
			++nz;
		}

		out[row * incy] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided transposed update \f$y \leftarrow y + {A^T} \cdot x\f$, for a general matrix (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, every nonzero VA[k] with
	 * bpntr[i] <= k < bpntr[i+1] adds VA[k] * rhs[incx*(i+roff-coff)] to out[incy*(bindx[k]+coff-roff)].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	const float *const x_sh = rhs + (incx) * (roff - coff);	/* operand, shifted by the strided offsets difference */
	float *const y_sh = out + (incy) * (coff - roff);	/* result, shifted the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = x_sh[incx * row];	/* the one operand entry shared by the whole row */
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: indices and products first, then the updates in order */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float upd0 = VA[nz] * xs;
			const float upd1 = VA[nz + 1] * xs;
			const float upd2 = VA[nz + 2] * xs;
			const float upd3 = VA[nz + 3] * xs;

			y_sh[col0 * incy] += upd0;
			y_sh[col1 * incy] += upd1;
			y_sh[col2 * incy] += upd2;
			y_sh[col3 * incy] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz] * xs;

			y_sh[col * incy] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided transposed update \f$y \leftarrow y + {A^T} \cdot x\f$, for a general matrix (\f$A \neq A^T\f$).
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, every nonzero VA[k] with
	 * bpntr[i] <= k < bpntr[i+1] adds VA[k] * rhs[incx*(i+roff-coff)] to out[incy*(bindx[k]+coff-roff)].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg (this body always returns RSB_ERR_NO_ERROR)
	 */

	const float *const x_sh = rhs + (incx) * (roff - coff);	/* operand, shifted by the strided offsets difference */
	float *const y_sh = out + (incy) * (coff - roff);	/* result, shifted the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xs = x_sh[incx * row];	/* the one operand entry shared by the whole row */
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: indices (widened to rsb_coo_idx_t) and products first, then the updates in order */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float upd0 = VA[nz] * xs;
			const float upd1 = VA[nz + 1] * xs;
			const float upd2 = VA[nz + 2] * xs;
			const float upd3 = VA[nz + 3] * xs;

			y_sh[col0 * incy] += upd0;
			y_sh[col1 * incy] += upd1;
			y_sh[col2 * incy] += upd2;
			y_sh[col3 * incy] += upd3;
			nz += 4;
		}

		/* leftover nonzeroes of this row, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float upd = VA[nz] * xs;

			y_sh[col * incy] += upd;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The values are used as stored (no conjugation step appears in this
	 * float kernel): for each row index i in [br,bc), every entry
	 * VA[bpntr[i]] .. VA[bpntr[i+1]-1] is multiplied by the one input
	 * element of row i and added to the output element selected by bindx.
	 * The input is read through rhs shifted by incx*(roff-coff), the output
	 * is written through out shifted by incy*(coff-roff).
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* Unrolled part: four entries per pass, products first, then updates in storage order. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float prod0 = VA[nz + 0] * x_row;
			const float prod1 = VA[nz + 1] * x_row;
			const float prod2 = VA[nz + 2] * x_row;
			const float prod3 = VA[nz + 3] * x_row;

			y_base[col0 * incy] += prod0;
			y_base[col1 * incy] += prod1;
			y_base[col2 * incy] += prod2;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Leftover entries (at most three), one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float prod = VA[nz] * x_row;

			y_base[col * incy] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * The values are used as stored (no conjugation step appears in this
	 * float kernel): for each row index i in [br,bc), every entry
	 * VA[bpntr[i]] .. VA[bpntr[i+1]-1] is multiplied by the one input
	 * element of row i and added to the output element selected by bindx.
	 * The input is read through rhs shifted by incx*(roff-coff), the output
	 * is written through out shifted by incy*(coff-roff).
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* Unrolled part: four entries per pass, products first, then updates in storage order. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float prod0 = VA[nz + 0] * x_row;
			const float prod1 = VA[nz + 1] * x_row;
			const float prod2 = VA[nz + 2] * x_row;
			const float prod3 = VA[nz + 3] * x_row;

			y_base[col0 * incy] += prod0;
			y_base[col1 * incy] += prod1;
			y_base[col2 * incy] += prod2;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Leftover entries (at most three), one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float prod = VA[nz] * x_row;

			y_base[col * incy] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Each stored entry a of row i with column j is used twice: a*rhs[j] is
	 * summed into a per-row accumulator (added to out[i] at the end of the
	 * row), and a*x_i is added to the shifted output element j.
	 * The second update is skipped for the first and the last entry of a
	 * row when roff==coff and j==i; entries in between get no such test.
	 * NOTE(review): this presumably relies on only one triangle being stored
	 * with sorted column indices, so that a diagonal entry can only be first
	 * or last in its row -- confirm against the caller.
	 * Rows with no entries are skipped entirely (out[i] is not touched).
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float row_sum = ((float)(0));
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;

		if (row_begin == row_end)
			continue;

		/* First entry of the row: may be the diagonal one. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col * incx];
		if (!(roff == coff && col == row))
			y_base[col * incy] += VA[nz] * x_row;
		nz = row_begin + 1;

		/* Interior entries, four per pass; the last entry of the row is left out. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = VA[nz + 0];
			const float v1 = VA[nz + 1];
			const float v2 = VA[nz + 2];
			const float v3 = VA[nz + 3];
			const float prod0 = v0 * x_row;
			const float prod1 = v1 * x_row;
			const float prod2 = v2 * x_row;
			const float prod3 = v3 * x_row;

			row_sum += v0 * x0;
			y_base[col0 * incy] += prod0;
			row_sum += v1 * x1;
			y_base[col1 * incy] += prod1;
			row_sum += v2 * x2;
			y_base[col2 * incy] += prod2;
			row_sum += v3 * x3;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const float prod0 = v0 * x_row;

			row_sum += v0 * x0;
			y_base[col0 * incy] += prod0;
			++nz;
		}

		/* Last entry of the row (absent when the row holds a single entry): may be the diagonal one. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col * incx];
			if (!(roff == coff && col == row))
				y_base[col * incy] += VA[nz] * x_row;
		}

		out[row * incy] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Each stored entry a of row i with column j is used twice: a*rhs[j] is
	 * summed into a per-row accumulator (added to out[i] at the end of the
	 * row), and a*x_i is added to the shifted output element j.
	 * The second update is skipped for the first and the last entry of a
	 * row when roff==coff and j==i; entries in between get no such test.
	 * NOTE(review): this presumably relies on only one triangle being stored
	 * with sorted column indices, so that a diagonal entry can only be first
	 * or last in its row -- confirm against the caller.
	 * Rows with no entries are skipped entirely (out[i] is not touched).
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float row_sum = ((float)(0));
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;

		if (row_begin == row_end)
			continue;

		/* First entry of the row: may be the diagonal one. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col * incx];
		if (!(roff == coff && col == row))
			y_base[col * incy] += VA[nz] * x_row;
		nz = row_begin + 1;

		/* Interior entries, four per pass; the last entry of the row is left out. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = VA[nz + 0];
			const float v1 = VA[nz + 1];
			const float v2 = VA[nz + 2];
			const float v3 = VA[nz + 3];
			const float prod0 = v0 * x_row;
			const float prod1 = v1 * x_row;
			const float prod2 = v2 * x_row;
			const float prod3 = v3 * x_row;

			row_sum += v0 * x0;
			y_base[col0 * incy] += prod0;
			row_sum += v1 * x1;
			y_base[col1 * incy] += prod1;
			row_sum += v2 * x2;
			y_base[col2 * incy] += prod2;
			row_sum += v3 * x3;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const float prod0 = v0 * x_row;

			row_sum += v0 * x0;
			y_base[col0 * incy] += prod0;
			++nz;
		}

		/* Last entry of the row (absent when the row holds a single entry): may be the diagonal one. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col * incx];
			if (!(roff == coff && col == row))
				y_base[col * incy] += VA[nz] * x_row;
		}

		out[row * incy] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done in this function: every argument is handed
	 * over unchanged to the non-transposed symmetric kernel and that
	 * kernel's error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sS_dE_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No computation is done in this function: every argument is handed
	 * over unchanged to the non-transposed symmetric kernel and that
	 * kernel's error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sS_dE_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done in this function: every argument is handed
	 * over unchanged to the non-transposed symmetric kernel and that
	 * kernel's error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sS_dE_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No computation is done in this function: every argument is handed
	 * over unchanged to the non-transposed symmetric kernel and that
	 * kernel's error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sS_dE_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Each stored entry a of row i with column j is used twice, as stored
	 * (no conjugation step appears in this float kernel): a*rhs[j] is
	 * summed into a per-row accumulator (added to out[i] at the end of the
	 * row), and a*x_i is added to the shifted output element j.
	 * The second update is skipped for the first and the last entry of a
	 * row when roff==coff and j==i; entries in between get no such test.
	 * NOTE(review): this presumably relies on only one triangle being stored
	 * with sorted column indices, so that a diagonal entry can only be first
	 * or last in its row -- confirm against the caller.
	 * Rows with no entries are skipped entirely (out[i] is not touched).
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float row_sum = ((float)(0));
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;

		if (row_begin == row_end)
			continue;

		/* First entry of the row: may be the diagonal one. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col * incx];
		if (!(roff == coff && col == row))
			y_base[col * incy] += VA[nz] * x_row;
		nz = row_begin + 1;

		/* Interior entries, four per pass; the last entry of the row is left out. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = VA[nz + 0];
			const float v1 = VA[nz + 1];
			const float v2 = VA[nz + 2];
			const float v3 = VA[nz + 3];
			const float prod0 = v0 * x_row;
			const float prod1 = v1 * x_row;
			const float prod2 = v2 * x_row;
			const float prod3 = v3 * x_row;

			row_sum += v0 * x0;
			y_base[col0 * incy] += prod0;
			row_sum += v1 * x1;
			y_base[col1 * incy] += prod1;
			row_sum += v2 * x2;
			y_base[col2 * incy] += prod2;
			row_sum += v3 * x3;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const float prod0 = v0 * x_row;

			row_sum += v0 * x0;
			y_base[col0 * incy] += prod0;
			++nz;
		}

		/* Last entry of the row (absent when the row holds a single entry): may be the diagonal one. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col * incx];
			if (!(roff == coff && col == row))
				y_base[col * incy] += VA[nz] * x_row;
		}

		out[row * incy] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Each stored entry a of row i with column j is used twice, as stored
	 * (no conjugation step appears in this float kernel): a*rhs[j] is
	 * summed into a per-row accumulator (added to out[i] at the end of the
	 * row), and a*x_i is added to the shifted output element j.
	 * The second update is skipped for the first and the last entry of a
	 * row when roff==coff and j==i; entries in between get no such test.
	 * NOTE(review): this presumably relies on only one triangle being stored
	 * with sorted column indices, so that a diagonal entry can only be first
	 * or last in its row -- confirm against the caller.
	 * Rows with no entries are skipped entirely (out[i] is not touched).
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float row_sum = ((float)(0));
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col;

		if (row_begin == row_end)
			continue;

		/* First entry of the row: may be the diagonal one. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col * incx];
		if (!(roff == coff && col == row))
			y_base[col * incy] += VA[nz] * x_row;
		nz = row_begin + 1;

		/* Interior entries, four per pass; the last entry of the row is left out. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = VA[nz + 0];
			const float v1 = VA[nz + 1];
			const float v2 = VA[nz + 2];
			const float v3 = VA[nz + 3];
			const float prod0 = v0 * x_row;
			const float prod1 = v1 * x_row;
			const float prod2 = v2 * x_row;
			const float prod3 = v3 * x_row;

			row_sum += v0 * x0;
			y_base[col0 * incy] += prod0;
			row_sum += v1 * x1;
			y_base[col1 * incy] += prod1;
			row_sum += v2 * x2;
			y_base[col2 * incy] += prod2;
			row_sum += v3 * x3;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const float prod0 = v0 * x_row;

			row_sum += v0 * x0;
			y_base[col0 * incy] += prod0;
			++nz;
		}

		/* Last entry of the row (absent when the row holds a single entry): may be the diagonal one. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col * incx];
			if (!(roff == coff && col == row))
				y_base[col * incy] += VA[nz] * x_row;
		}

		out[row * incy] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done in this function: every argument is handed
	 * over unchanged to the non-transposed kernel of the same symmetry
	 * and that kernel's error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sH_dE_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No computation is done in this function: every argument is handed
	 * over unchanged to the non-transposed kernel of the same symmetry
	 * and that kernel's error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sH_dE_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * No computation is done in this function: every argument is handed
	 * over unchanged to the non-transposed kernel of the same symmetry
	 * and that kernel's error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sH_dE_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * No computation is done in this function: every argument is handed
	 * over unchanged to the non-transposed kernel of the same symmetry
	 * and that kernel's error value is returned.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* Symmetric transposed reverts to symmetric not transposed */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sH_dE_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each row index i in [br,bc) the products VA[n]*rhs[bindx[n]*incx],
	 * n in [bpntr[i],bpntr[i+1]), are summed in storage order into a
	 * per-row accumulator starting from zero, which is then added to
	 * out[i*incy] (also for rows with no entries).
	 * rhs and out are indexed directly: roff and coff are not used here.
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float *const values = VA;
		float row_sum = ((float)(0));
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* Unrolled part: gather four operand pairs, then accumulate them in storage order. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = values[nz + 0];
			const float v1 = values[nz + 1];
			const float v2 = values[nz + 2];
			const float v3 = values[nz + 3];

			row_sum += v0 * x0;
			row_sum += v1 * x1;
			row_sum += v2 * x2;
			row_sum += v3 * x3;
			nz += 4;
		}

		/* Leftover entries (at most three), one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = values[nz];

			row_sum += v0 * x0;
			++nz;
		}

		out[row * incy] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each row index i in [br,bc) the products VA[n]*rhs[bindx[n]*incx],
	 * n in [bpntr[i],bpntr[i+1]), are summed in storage order into a
	 * per-row accumulator starting from zero, which is then added to
	 * out[i*incy] (also for rows with no entries).
	 * rhs and out are indexed directly: roff and coff are not used here.
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float *const values = VA;
		float row_sum = ((float)(0));
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* Unrolled part: gather four operand pairs, then accumulate them in storage order. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = values[nz + 0];
			const float v1 = values[nz + 1];
			const float v2 = values[nz + 2];
			const float v3 = values[nz + 3];

			row_sum += v0 * x0;
			row_sum += v1 * x1;
			row_sum += v2 * x2;
			row_sum += v3 * x3;
			nz += 4;
		}

		/* Leftover entries (at most three), one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = values[nz];

			row_sum += v0 * x0;
			++nz;
		}

		out[row * incy] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * For each row index i in [br,bc), the entries VA[bpntr[i]] .. VA[bpntr[i+1]-1]
	 * are each multiplied by the one input element of row i and added to the
	 * output element selected by the matching bindx entry.
	 * The input is read through rhs shifted by incx*(roff-coff), the output
	 * is written through out shifted by incy*(coff-roff); incx and incy are
	 * the element strides.
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* Unrolled part: four entries per pass, products first, then updates in storage order. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float prod0 = VA[nz + 0] * x_row;
			const float prod1 = VA[nz + 1] * x_row;
			const float prod2 = VA[nz + 2] * x_row;
			const float prod3 = VA[nz + 3] * x_row;

			y_base[col0 * incy] += prod0;
			y_base[col1 * incy] += prod1;
			y_base[col2 * incy] += prod2;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Leftover entries (at most three), one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float prod = VA[nz] * x_row;

			y_base[col * incy] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * For each row index i in [br,bc), the entries VA[bpntr[i]] .. VA[bpntr[i+1]-1]
	 * are each multiplied by the one input element of row i and added to the
	 * output element selected by the matching bindx entry.
	 * The input is read through rhs shifted by incx*(roff-coff), the output
	 * is written through out shifted by incy*(coff-roff); incx and incy are
	 * the element strides.
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* Unrolled part: four entries per pass, products first, then updates in storage order. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float prod0 = VA[nz + 0] * x_row;
			const float prod1 = VA[nz + 1] * x_row;
			const float prod2 = VA[nz + 2] * x_row;
			const float prod3 = VA[nz + 3] * x_row;

			y_base[col0 * incy] += prod0;
			y_base[col1 * incy] += prod1;
			y_base[col2 * incy] += prod2;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Leftover entries (at most three), one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float prod = VA[nz] * x_row;

			y_base[col * incy] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * The values are used as stored (no conjugation step appears in this
	 * float kernel): for each row index i in [br,bc), every entry
	 * VA[bpntr[i]] .. VA[bpntr[i+1]-1] is multiplied by the one input
	 * element of row i and added to the output element selected by bindx.
	 * The input is read through rhs shifted by incx*(roff-coff), the output
	 * is written through out shifted by incy*(coff-roff).
	 * \return \rsb_errval_inp_param_msg (this body only ever returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float *const x_base = rhs + (incx) * (roff - coff);
	float *const y_base = out + (incy) * (coff - roff);
	rsb_coo_idx_t row;

	/* experimental, for the bounded box patch */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float x_row = x_base[incx * row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* Unrolled part: four entries per pass, products first, then updates in storage order. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz + 0];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float prod0 = VA[nz + 0] * x_row;
			const float prod1 = VA[nz + 1] * x_row;
			const float prod2 = VA[nz + 2] * x_row;
			const float prod3 = VA[nz + 3] * x_row;

			y_base[col0 * incy] += prod0;
			y_base[col1 * incy] += prod1;
			y_base[col2 * incy] += prod2;
			y_base[col3 * incy] += prod3;
			nz += 4;
		}

		/* Leftover entries (at most three), one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float prod = VA[nz] * x_row;

			y_base[col * incy] += prod;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, one x element is read
	 * (at position i*incx of rhs shifted by (roff-coff)*incx), it multiplies every nonzero
	 * VA[k] with bpntr[i] <= k < bpntr[i+1], and each product is added to the y element at
	 * position bindx[k]*incy of out shifted by (coff-roff)*incy.
	 * Each rsb_half_idx_t column index is widened to rsb_coo_idx_t before being used.
	 * Nonzeroes are used as stored: no conjugation is performed on the float values.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const float *x_shifted = rhs+(incx)*(roff-coff);
	float *y_shifted = out+(incy)*(coff-roff);
	rsb_coo_idx_t row = br;

	while(RSB_LIKELY(row<bc))	/* experimental, for the bounded box patch */
	{
		const float xi = x_shifted[incx*row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float add0 = VA[nz+0]*xi;
			const float add1 = VA[nz+1]*xi;
			const float add2 = VA[nz+2]*xi;
			const float add3 = VA[nz+3]*xi;

			y_shifted[col0*incy] += add0;
			y_shifted[col1*incy] += add1;
			y_shifted[col2*incy] += add2;
			y_shifted[col3*incy] += add3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float add0 = VA[nz]*xi;

			y_shifted[col0*incy] += add0;
			++nz;
		}

		++row;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. Every stored nonzero VA[k] of row i,
	 * with column j=bindx[k], is applied twice:
	 *  - VA[k]*rhs[j*incx] is summed into a per-row accumulator, added to out[i*incy] at row end;
	 *  - VA[k]*xi is added to position j*incy of out shifted by (coff-roff)*incy, xi being
	 *    the element at position i*incx of rhs shifted by (roff-coff)*incx.
	 * For the first and the last nonzero of a row the second contribution is skipped when
	 * roff==coff and j==i, so that a diagonal element is applied once only; the nonzeroes
	 * in between are not checked.
	 * NOTE(review): this presumably relies on sorted column indices in a triangle-only
	 * storage (diagonal either first or last in its row) -- to be confirmed against the caller.
	 * Rows without nonzeroes leave out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const float *x_shifted = rhs+(incx)*(roff-coff);
	float *y_shifted = out+(incy)*(coff-roff);
	rsb_coo_idx_t row = 0, col = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xi = x_shifted[incx*row];
		const rsb_nnz_idx_t row_begin = bpntr[row], row_end = bpntr[row+1];

		if(row_begin!=row_end)
		{
			/* one past the last nonzero handled without the diagonal check */
			const rsb_nnz_idx_t interior_end = row_end-1;
			float acc = ((float)(0));

			/* first nonzero of the row: diagonal-checked */
			nz = row_begin;
			col = bindx[nz];
			acc += VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				y_shifted[col*incy] += VA[nz]*xi;
			++nz;

			/* interior nonzeroes, four per iteration */
			while(nz+3<interior_end)
			{
				const rsb_coo_idx_t col0 = bindx[nz+0];
				const rsb_coo_idx_t col1 = bindx[nz+1];
				const rsb_coo_idx_t col2 = bindx[nz+2];
				const rsb_coo_idx_t col3 = bindx[nz+3];
				const float val0 = VA[nz+0];
				const float val1 = VA[nz+1];
				const float val2 = VA[nz+2];
				const float val3 = VA[nz+3];
				const float x0 = rhs[col0*incx];
				const float x1 = rhs[col1*incx];
				const float x2 = rhs[col2*incx];
				const float x3 = rhs[col3*incx];
				const float add0 = val0*xi;
				const float add1 = val1*xi;
				const float add2 = val2*xi;
				const float add3 = val3*xi;

				acc += val0*x0;
				y_shifted[col0*incy] += add0;
				acc += val1*x1;
				y_shifted[col1*incy] += add1;
				acc += val2*x2;
				y_shifted[col2*incy] += add2;
				acc += val3*x3;
				y_shifted[col3*incy] += add3;
				nz += 4;
			}

			/* interior nonzeroes left over by the unrolled loop */
			while(nz<interior_end)
			{
				const rsb_coo_idx_t col0 = bindx[nz];
				const float val0 = VA[nz];
				const float x0 = rhs[col0*incx];
				const float add0 = val0*xi;

				acc += val0*x0;
				y_shifted[col0*incy] += add0;
				++nz;
			}

			/* last nonzero of the row (if distinct from the first): diagonal-checked */
			if(nz<row_end)
			{
				col = bindx[nz];
				acc += VA[nz]*rhs[col*incx];
				if(!(roff==coff && col==row))
					y_shifted[col*incy] += VA[nz]*xi;
			}

			out[incy*row] += acc;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + {A} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. Every stored nonzero VA[k] of row i,
	 * with column j=bindx[k] (widened to rsb_coo_idx_t), is applied twice:
	 *  - VA[k]*rhs[j*incx] is summed into a per-row accumulator, added to out[i*incy] at row end;
	 *  - VA[k]*xi is added to position j*incy of out shifted by (coff-roff)*incy, xi being
	 *    the element at position i*incx of rhs shifted by (roff-coff)*incx.
	 * For the first and the last nonzero of a row the second contribution is skipped when
	 * roff==coff and j==i, so that a diagonal element is applied once only; the nonzeroes
	 * in between are not checked.
	 * NOTE(review): this presumably relies on sorted column indices in a triangle-only
	 * storage (diagonal either first or last in its row) -- to be confirmed against the caller.
	 * Rows without nonzeroes leave out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const float *x_shifted = rhs+(incx)*(roff-coff);
	float *y_shifted = out+(incy)*(coff-roff);
	rsb_coo_idx_t row = 0, col = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xi = x_shifted[incx*row];
		const rsb_nnz_idx_t row_begin = bpntr[row], row_end = bpntr[row+1];

		if(row_begin!=row_end)
		{
			/* one past the last nonzero handled without the diagonal check */
			const rsb_nnz_idx_t interior_end = row_end-1;
			float acc = ((float)(0));

			/* first nonzero of the row: diagonal-checked */
			nz = row_begin;
			col = bindx[nz];
			acc += VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				y_shifted[col*incy] += VA[nz]*xi;
			++nz;

			/* interior nonzeroes, four per iteration */
			while(nz+3<interior_end)
			{
				const rsb_coo_idx_t col0 = bindx[nz+0];
				const rsb_coo_idx_t col1 = bindx[nz+1];
				const rsb_coo_idx_t col2 = bindx[nz+2];
				const rsb_coo_idx_t col3 = bindx[nz+3];
				const float val0 = VA[nz+0];
				const float val1 = VA[nz+1];
				const float val2 = VA[nz+2];
				const float val3 = VA[nz+3];
				const float x0 = rhs[col0*incx];
				const float x1 = rhs[col1*incx];
				const float x2 = rhs[col2*incx];
				const float x3 = rhs[col3*incx];
				const float add0 = val0*xi;
				const float add1 = val1*xi;
				const float add2 = val2*xi;
				const float add3 = val3*xi;

				acc += val0*x0;
				y_shifted[col0*incy] += add0;
				acc += val1*x1;
				y_shifted[col1*incy] += add1;
				acc += val2*x2;
				y_shifted[col2*incy] += add2;
				acc += val3*x3;
				y_shifted[col3*incy] += add3;
				nz += 4;
			}

			/* interior nonzeroes left over by the unrolled loop */
			while(nz<interior_end)
			{
				const rsb_coo_idx_t col0 = bindx[nz];
				const float val0 = VA[nz];
				const float x0 = rhs[col0*incx];
				const float add0 = val0*xi;

				acc += val0*x0;
				y_shifted[col0*incy] += add0;
				++nz;
			}

			/* last nonzero of the row (if distinct from the first): diagonal-checked */
			if(nz<row_end)
			{
				col = bindx[nz];
				acc += VA[nz]*rhs[col*incx];
				if(!(roff==coff && col==row))
					y_shifted[col*incy] += VA[nz]*xi;
			}

			out[incy*row] += acc;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * A symmetric matrix equals its transpose, so no transposed code is needed: all the
	 * arguments are passed unchanged to the non transposed symmetric kernel.
	 * \return the value returned by rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sS_dI_uG
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * A symmetric matrix equals its transpose, so no transposed code is needed: all the
	 * arguments are passed unchanged to the non transposed symmetric kernel.
	 * \return the value returned by rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sS_dI_uG
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Elements are float, so conjugation leaves them unchanged and A^H coincides with
	 * A^T, hence with A: all the arguments are passed unchanged to the non transposed
	 * symmetric kernel.
	 * \return the value returned by rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sS_dI_uG
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A = A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Elements are float, so conjugation leaves them unchanged and A^H coincides with
	 * A^T, hence with A: all the arguments are passed unchanged to the non transposed
	 * symmetric kernel.
	 * \return the value returned by rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sS_dI_uG
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sS_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + {A} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Elements are float, so no conjugation is performed anywhere in this kernel.
	 * Rows br (included) to bc (excluded) are visited. Every stored nonzero VA[k] of row i,
	 * with column j=bindx[k], is applied twice:
	 *  - VA[k]*rhs[j*incx] is summed into a per-row accumulator, added to out[i*incy] at row end;
	 *  - VA[k]*xi is added to position j*incy of out shifted by (coff-roff)*incy, xi being
	 *    the element at position i*incx of rhs shifted by (roff-coff)*incx.
	 * For the first and the last nonzero of a row the second contribution is skipped when
	 * roff==coff and j==i, so that a diagonal element is applied once only; the nonzeroes
	 * in between are not checked.
	 * NOTE(review): this presumably relies on sorted column indices in a triangle-only
	 * storage (diagonal either first or last in its row) -- to be confirmed against the caller.
	 * Rows without nonzeroes leave out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const float *x_shifted = rhs+(incx)*(roff-coff);
	float *y_shifted = out+(incy)*(coff-roff);
	rsb_coo_idx_t row = 0, col = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xi = x_shifted[incx*row];
		const rsb_nnz_idx_t row_begin = bpntr[row], row_end = bpntr[row+1];

		if(row_begin!=row_end)
		{
			/* one past the last nonzero handled without the diagonal check */
			const rsb_nnz_idx_t interior_end = row_end-1;
			float acc = ((float)(0));

			/* first nonzero of the row: diagonal-checked */
			nz = row_begin;
			col = bindx[nz];
			acc += VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				y_shifted[col*incy] += VA[nz]*xi;
			++nz;

			/* interior nonzeroes, four per iteration */
			while(nz+3<interior_end)
			{
				const rsb_coo_idx_t col0 = bindx[nz+0];
				const rsb_coo_idx_t col1 = bindx[nz+1];
				const rsb_coo_idx_t col2 = bindx[nz+2];
				const rsb_coo_idx_t col3 = bindx[nz+3];
				const float val0 = VA[nz+0];
				const float val1 = VA[nz+1];
				const float val2 = VA[nz+2];
				const float val3 = VA[nz+3];
				const float x0 = rhs[col0*incx];
				const float x1 = rhs[col1*incx];
				const float x2 = rhs[col2*incx];
				const float x3 = rhs[col3*incx];
				const float add0 = val0*xi;
				const float add1 = val1*xi;
				const float add2 = val2*xi;
				const float add3 = val3*xi;

				acc += val0*x0;
				y_shifted[col0*incy] += add0;
				acc += val1*x1;
				y_shifted[col1*incy] += add1;
				acc += val2*x2;
				y_shifted[col2*incy] += add2;
				acc += val3*x3;
				y_shifted[col3*incy] += add3;
				nz += 4;
			}

			/* interior nonzeroes left over by the unrolled loop */
			while(nz<interior_end)
			{
				const rsb_coo_idx_t col0 = bindx[nz];
				const float val0 = VA[nz];
				const float x0 = rhs[col0*incx];
				const float add0 = val0*xi;

				acc += val0*x0;
				y_shifted[col0*incy] += add0;
				++nz;
			}

			/* last nonzero of the row (if distinct from the first): diagonal-checked */
			if(nz<row_end)
			{
				col = bindx[nz];
				acc += VA[nz]*rhs[col*incx];
				if(!(roff==coff && col==row))
					y_shifted[col*incy] += VA[nz]*xi;
			}

			out[incy*row] += acc;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + {A} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Elements are float, so no conjugation is performed anywhere in this kernel.
	 * Rows br (included) to bc (excluded) are visited. Every stored nonzero VA[k] of row i,
	 * with column j=bindx[k] (widened to rsb_coo_idx_t), is applied twice:
	 *  - VA[k]*rhs[j*incx] is summed into a per-row accumulator, added to out[i*incy] at row end;
	 *  - VA[k]*xi is added to position j*incy of out shifted by (coff-roff)*incy, xi being
	 *    the element at position i*incx of rhs shifted by (roff-coff)*incx.
	 * For the first and the last nonzero of a row the second contribution is skipped when
	 * roff==coff and j==i, so that a diagonal element is applied once only; the nonzeroes
	 * in between are not checked.
	 * NOTE(review): this presumably relies on sorted column indices in a triangle-only
	 * storage (diagonal either first or last in its row) -- to be confirmed against the caller.
	 * Rows without nonzeroes leave out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const float *x_shifted = rhs+(incx)*(roff-coff);
	float *y_shifted = out+(incy)*(coff-roff);
	rsb_coo_idx_t row = 0, col = 0;
	rsb_nnz_idx_t nz = 0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float xi = x_shifted[incx*row];
		const rsb_nnz_idx_t row_begin = bpntr[row], row_end = bpntr[row+1];

		if(row_begin!=row_end)
		{
			/* one past the last nonzero handled without the diagonal check */
			const rsb_nnz_idx_t interior_end = row_end-1;
			float acc = ((float)(0));

			/* first nonzero of the row: diagonal-checked */
			nz = row_begin;
			col = bindx[nz];
			acc += VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				y_shifted[col*incy] += VA[nz]*xi;
			++nz;

			/* interior nonzeroes, four per iteration */
			while(nz+3<interior_end)
			{
				const rsb_coo_idx_t col0 = bindx[nz+0];
				const rsb_coo_idx_t col1 = bindx[nz+1];
				const rsb_coo_idx_t col2 = bindx[nz+2];
				const rsb_coo_idx_t col3 = bindx[nz+3];
				const float val0 = VA[nz+0];
				const float val1 = VA[nz+1];
				const float val2 = VA[nz+2];
				const float val3 = VA[nz+3];
				const float x0 = rhs[col0*incx];
				const float x1 = rhs[col1*incx];
				const float x2 = rhs[col2*incx];
				const float x3 = rhs[col3*incx];
				const float add0 = val0*xi;
				const float add1 = val1*xi;
				const float add2 = val2*xi;
				const float add3 = val3*xi;

				acc += val0*x0;
				y_shifted[col0*incy] += add0;
				acc += val1*x1;
				y_shifted[col1*incy] += add1;
				acc += val2*x2;
				y_shifted[col2*incy] += add2;
				acc += val3*x3;
				y_shifted[col3*incy] += add3;
				nz += 4;
			}

			/* interior nonzeroes left over by the unrolled loop */
			while(nz<interior_end)
			{
				const rsb_coo_idx_t col0 = bindx[nz];
				const float val0 = VA[nz];
				const float x0 = rhs[col0*incx];
				const float add0 = val0*xi;

				acc += val0*x0;
				y_shifted[col0*incy] += add0;
				++nz;
			}

			/* last nonzero of the row (if distinct from the first): diagonal-checked */
			if(nz<row_end)
			{
				col = bindx[nz];
				acc += VA[nz]*rhs[col*incx];
				if(!(roff==coff && col==row))
					y_shifted[col*incy] += VA[nz]*xi;
			}

			out[incy*row] += acc;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Elements are float, so conjugation leaves them unchanged and \f$A = A^H\f$ implies
	 * \f$A = A^T\f$: all the arguments are passed unchanged to the non transposed
	 * hermitian kernel.
	 * \return the value returned by rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sH_dI_uG
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for \f$y \leftarrow y + {A^T} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Elements are float, so conjugation leaves them unchanged and \f$A = A^H\f$ implies
	 * \f$A = A^T\f$: all the arguments are passed unchanged to the non transposed
	 * hermitian kernel.
	 * \return the value returned by rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sH_dI_uG
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_C__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * A hermitian matrix equals its conjugate transpose, so no transposed code is needed:
	 * all the arguments are passed unchanged to the non transposed hermitian kernel.
	 * \return the value returned by rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sH_dI_uG
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_C__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sasa_float_H__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Entry point for \f$y \leftarrow y + {A^H} \cdot x\f$, where \f$A = A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * A hermitian matrix equals its conjugate transpose, so no transposed code is needed:
	 * all the arguments are passed unchanged to the non transposed hermitian kernel.
	 * \return the value returned by rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sH_dI_uG
	 */

	/* NOTE: the implicit diagonal is not applied here: that is left to the caller level. */
	const rsb_err_t errval = rsb__BCSR_spmv_sasa_float_H__tN_r1_c1_uu_sH_dI_uG(VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,br,bc,roff,coff,flags,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides, and \f$\alpha\f$ read from *alphap.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i the products
	 * VA[k]*rhs[bindx[k]*incx], bpntr[i] <= k < bpntr[i+1], are summed in storage order,
	 * and alpha times the sum is added to out[i*incy] (also for rows without nonzeroes).
	 * No scaling of the existing y values takes place in this kernel.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float alpha = *alphap;
	rsb_coo_idx_t row = br;

	while(RSB_LIKELY(row<bc))	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		float acc = ((float)(0));

		/* main part: four nonzeroes per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float val0 = VA[nz+0];
			const float val1 = VA[nz+1];
			const float val2 = VA[nz+2];
			const float val3 = VA[nz+3];
			const float x0 = rhs[col0*incx];
			const float x1 = rhs[col1*incx];
			const float x2 = rhs[col2*incx];
			const float x3 = rhs[col3*incx];

			acc += val0*x0;
			acc += val1*x1;
			acc += val2*x2;
			acc += val3*x3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float val0 = VA[nz];
			const float x0 = rhs[col0*incx];

			acc += val0*x0;
			++nz;
		}

		out[incy*row] += (alpha)*acc;
		++row;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides, and \f$\alpha\f$ read from *alphap.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i the products
	 * VA[k]*rhs[bindx[k]*incx], bpntr[i] <= k < bpntr[i+1], are summed in storage order,
	 * and alpha times the sum is added to out[i*incy] (also for rows without nonzeroes).
	 * Each rsb_half_idx_t column index is widened to rsb_coo_idx_t before being used.
	 * No scaling of the existing y values takes place in this kernel.
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float alpha = *alphap;
	rsb_coo_idx_t row = br;

	while(RSB_LIKELY(row<bc))	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];
		float acc = ((float)(0));

		/* main part: four nonzeroes per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float val0 = VA[nz+0];
			const float val1 = VA[nz+1];
			const float val2 = VA[nz+2];
			const float val3 = VA[nz+3];
			const float x0 = rhs[col0*incx];
			const float x1 = rhs[col1*incx];
			const float x2 = rhs[col2*incx];
			const float x3 = rhs[col3*incx];

			acc += val0*x0;
			acc += val1*x1;
			acc += val2*x2;
			acc += val3*x3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float val0 = VA[nz];
			const float x0 = rhs[col0*incx];

			acc += val0*x0;
			++nz;
		}

		out[incy*row] += (alpha)*acc;
		++row;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides, and \f$\alpha\f$ read from *alphap.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, alpha times one x element
	 * (at position i*incx of rhs shifted by (roff-coff)*incx) multiplies every nonzero
	 * VA[k] with bpntr[i] <= k < bpntr[i+1], and each product is added to the y element at
	 * position bindx[k]*incy of out shifted by (coff-roff)*incy.
	 * No scaling of the existing y values takes place in this kernel.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float alpha = *alphap;
	const float *x_shifted = rhs+(incx)*(roff-coff);
	float *y_shifted = out+(incy)*(coff-roff);
	rsb_coo_idx_t row = br;

	while(RSB_LIKELY(row<bc))	/* experimental, for the bounded box patch */
	{
		/* alpha is folded into the x element once per row */
		const float axi = (alpha)*x_shifted[incx*row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float add0 = VA[nz+0]*axi;
			const float add1 = VA[nz+1]*axi;
			const float add2 = VA[nz+2]*axi;
			const float add3 = VA[nz+3]*axi;

			y_shifted[col0*incy] += add0;
			y_shifted[col1*incy] += add1;
			y_shifted[col2*incy] += add2;
			y_shifted[col3*incy] += add3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float add0 = VA[nz]*axi;

			y_shifted[col0*incy] += add0;
			++nz;
		}

		++row;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tT_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides, and \f$\alpha\f$ read from *alphap.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, alpha times one x element
	 * (at position i*incx of rhs shifted by (roff-coff)*incx) multiplies every nonzero
	 * VA[k] with bpntr[i] <= k < bpntr[i+1], and each product is added to the y element at
	 * position bindx[k]*incy of out shifted by (coff-roff)*incy.
	 * Each rsb_half_idx_t column index is widened to rsb_coo_idx_t before being used.
	 * No scaling of the existing y values takes place in this kernel.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float alpha = *alphap;
	const float *x_shifted = rhs+(incx)*(roff-coff);
	float *y_shifted = out+(incy)*(coff-roff);
	rsb_coo_idx_t row = br;

	while(RSB_LIKELY(row<bc))	/* experimental, for the bounded box patch */
	{
		/* alpha is folded into the x element once per row */
		const float axi = (alpha)*x_shifted[incx*row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float add0 = VA[nz+0]*axi;
			const float add1 = VA[nz+1]*axi;
			const float add2 = VA[nz+2]*axi;
			const float add3 = VA[nz+3]*axi;

			y_shifted[col0*incy] += add0;
			y_shifted[col1*incy] += add1;
			y_shifted[col2*incy] += add2;
			y_shifted[col3*incy] += add3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float add0 = VA[nz]*axi;

			y_shifted[col0*incy] += add0;
			++nz;
		}

		++row;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Strided kernel for \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, where \f$A \neq A^T\f$,
	 * with incx and incy as x and y vector strides, and \f$\alpha\f$ read from *alphap.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (included) to bc (excluded) are visited. For row i, alpha times one x element
	 * (at position i*incx of rhs shifted by (roff-coff)*incx) multiplies every nonzero
	 * VA[k] with bpntr[i] <= k < bpntr[i+1], and each product is added to the y element at
	 * position bindx[k]*incy of out shifted by (coff-roff)*incy.
	 * Nonzeroes are used as stored: no conjugation is performed on the float values.
	 * No scaling of the existing y values takes place in this kernel.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float alpha = *alphap;
	const float *x_shifted = rhs+(incx)*(roff-coff);
	float *y_shifted = out+(incy)*(coff-roff);
	rsb_coo_idx_t row = br;

	while(RSB_LIKELY(row<bc))	/* experimental, for the bounded box patch */
	{
		/* alpha is folded into the x element once per row */
		const float axi = (alpha)*x_shifted[incx*row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four nonzeroes per iteration */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz+0];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float add0 = VA[nz+0]*axi;
			const float add1 = VA[nz+1]*axi;
			const float add2 = VA[nz+2]*axi;
			const float add3 = VA[nz+3]*axi;

			y_shifted[col0*incy] += add0;
			y_shifted[col1*incy] += add1;
			y_shifted[col2*incy] += add2;
			y_shifted[col3*incy] += add3;
			nz += 4;
		}

		/* remainder: at most three nonzeroes left in this row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float add0 = VA[nz]*axi;

			y_shifted[col0*incy] += add0;
			++nz;
		}

		++row;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tC_r1_c1_uu_sU_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 * Stored rows br (included) to bc (excluded) are visited; bpntr[i] and bpntr[i+1]
	 * delimit the entries of row i inside VA and bindx.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	const float scal=*alphap;
	/* both vectors are addressed through bases shifted by the roff/coff difference */
	const float *xsh=rhs+(incx)*(roff-coff);
	float *ysh=out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		/* scaled input element shared by every entry of this stored row */
		const float xr=scal*xsh[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;

		/* main part: four entries per trip; half-width indices are widened to rsb_coo_idx_t first */
		while(nz+3<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float t0=VA[nz  ]*xr;
			const float t1=VA[nz+1]*xr;
			const float t2=VA[nz+2]*xr;
			const float t3=VA[nz+3]*xr;

			ysh[c0*incy]+=t0;
			ysh[c1*incy]+=t1;
			ysh[c2*incy]+=t2;
			ysh[c3*incy]+=t3;
			nz+=4;
		}

		/* leftover entries (at most three), one at a time */
		while(nz<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float t0=VA[nz]*xr;

			ysh[c0*incy]+=t0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 * Every stored entry (i,j) feeds the dot product of row i and, mirrored, a scattered
	 * update at position j. The mirrored update is skipped when roff==coff and j==i;
	 * that test is made only on the first and on the last entry of each row, while
	 * interior entries are always applied both ways.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	const float scal=*alphap;
	/* bases shifted by the roff/coff difference, used for the mirrored contribution */
	const float *xsh=rhs+(incx)*(roff-coff);
	float *ysh=out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		float dot=((float)(0));
		const float xr=scal*xsh[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;
		rsb_coo_idx_t col;

		/* a row without entries leaves out untouched */
		if(rbeg==rend)
			continue;

		/* first entry: may lie on the diagonal, hence the guarded mirrored update */
		col=bindx[nz];
		dot+=VA[nz]*rhs[col*incx];
		if(!(roff==coff && col==row))
			ysh[col*incy]+=VA[nz]*xr;

		/* interior entries, four per trip; all loads precede the interleaved updates */
		nz=rbeg+1;
		while(nz+3<rend-1)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];
			const float v0=VA[nz  ],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float t0=v0*xr,t1=v1*xr,t2=v2*xr,t3=v3*xr;

			dot+=v0*x0;
			ysh[c0*incy]+=t0;
			dot+=v1*x1;
			ysh[c1*incy]+=t1;
			dot+=v2*x2;
			ysh[c2*incy]+=t2;
			dot+=v3*x3;
			ysh[c3*incy]+=t3;
			nz+=4;
		}

		/* remaining interior entries, one at a time */
		while(nz<rend-1)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0*incx];
			const float v0=VA[nz];
			const float t0=v0*xr;

			dot+=v0*x0;
			ysh[c0*incy]+=t0;
			++nz;
		}

		/* last entry (absent when the row holds a single entry): guarded like the first */
		if(nz<rend)
		{
			col=bindx[nz];
			dot+=VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				ysh[col*incy]+=VA[nz]*xr;
		}

		out[row*incy]+=scal*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 * Every stored entry (i,j) feeds the dot product of row i and, mirrored, a scattered
	 * update at position j. The mirrored update is skipped when roff==coff and j==i;
	 * that test is made only on the first and on the last entry of each row, while
	 * interior entries are always applied both ways.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	const float scal=*alphap;
	/* bases shifted by the roff/coff difference, used for the mirrored contribution */
	const float *xsh=rhs+(incx)*(roff-coff);
	float *ysh=out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		float dot=((float)(0));
		const float xr=scal*xsh[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;
		rsb_coo_idx_t col;

		/* a row without entries leaves out untouched */
		if(rbeg==rend)
			continue;

		/* first entry: may lie on the diagonal, hence the guarded mirrored update */
		col=bindx[nz];
		dot+=VA[nz]*rhs[col*incx];
		if(!(roff==coff && col==row))
			ysh[col*incy]+=VA[nz]*xr;

		/* interior entries, four per trip; half-width indices are widened to rsb_coo_idx_t first */
		nz=rbeg+1;
		while(nz+3<rend-1)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];
			const float v0=VA[nz  ],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float t0=v0*xr,t1=v1*xr,t2=v2*xr,t3=v3*xr;

			dot+=v0*x0;
			ysh[c0*incy]+=t0;
			dot+=v1*x1;
			ysh[c1*incy]+=t1;
			dot+=v2*x2;
			ysh[c2*incy]+=t2;
			dot+=v3*x3;
			ysh[c3*incy]+=t3;
			nz+=4;
		}

		/* remaining interior entries, one at a time */
		while(nz<rend-1)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0*incx];
			const float v0=VA[nz];
			const float t0=v0*xr;

			dot+=v0*x0;
			ysh[c0*incy]+=t0;
			++nz;
		}

		/* last entry (absent when the row holds a single entry): guarded like the first */
		if(nz<rend)
		{
			col=bindx[nz];
			dot+=VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				ysh[col*incy]+=VA[nz]*xr;
		}

		out[row*incy]+=scal*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^T. \f$
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 * All arguments are forwarded unchanged to the non transposed symmetric kernel.
	 * \return \rsb_errval_inp_param_msg (whatever the delegate kernel returns)
	 */

	/* On a symmetric matrix the transposed product coincides with the non transposed one. */
	const rsb_err_t errval=rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sS_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tT_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^T. \f$
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 * All arguments are forwarded unchanged to the non transposed symmetric kernel.
	 * \return \rsb_errval_inp_param_msg (whatever the delegate kernel returns)
	 */

	/* On a symmetric matrix the transposed product coincides with the non transposed one. */
	const rsb_err_t errval=rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sS_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^T. \f$
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 * All arguments are forwarded unchanged to the non transposed symmetric kernel.
	 * \return \rsb_errval_inp_param_msg (whatever the delegate kernel returns)
	 */

	/* The symmetric transposed case is served by the symmetric non transposed kernel. */
	const rsb_err_t errval=rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sS_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tC_r1_c1_uu_sS_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^T. \f$
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 * All arguments are forwarded unchanged to the non transposed symmetric kernel.
	 * \return \rsb_errval_inp_param_msg (whatever the delegate kernel returns)
	 */

	/* The symmetric transposed case is served by the symmetric non transposed kernel. */
	const rsb_err_t errval=rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sS_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^H. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 * Every stored entry (i,j) feeds the dot product of row i and, mirrored, a scattered
	 * update at position j (the same value is used both ways). The mirrored update is
	 * skipped when roff==coff and j==i; that test is made only on the first and on the
	 * last entry of each row, while interior entries are always applied both ways.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	const float scal=*alphap;
	/* bases shifted by the roff/coff difference, used for the mirrored contribution */
	const float *xsh=rhs+(incx)*(roff-coff);
	float *ysh=out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		float dot=((float)(0));
		const float xr=scal*xsh[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;
		rsb_coo_idx_t col;

		/* a row without entries leaves out untouched */
		if(rbeg==rend)
			continue;

		/* first entry: may lie on the diagonal, hence the guarded mirrored update */
		col=bindx[nz];
		dot+=VA[nz]*rhs[col*incx];
		if(!(roff==coff && col==row))
			ysh[col*incy]+=VA[nz]*xr;

		/* interior entries, four per trip; all loads precede the interleaved updates */
		nz=rbeg+1;
		while(nz+3<rend-1)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];
			const float v0=VA[nz  ],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float t0=v0*xr,t1=v1*xr,t2=v2*xr,t3=v3*xr;

			dot+=v0*x0;
			ysh[c0*incy]+=t0;
			dot+=v1*x1;
			ysh[c1*incy]+=t1;
			dot+=v2*x2;
			ysh[c2*incy]+=t2;
			dot+=v3*x3;
			ysh[c3*incy]+=t3;
			nz+=4;
		}

		/* remaining interior entries, one at a time */
		while(nz<rend-1)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0*incx];
			const float v0=VA[nz];
			const float t0=v0*xr;

			dot+=v0*x0;
			ysh[c0*incy]+=t0;
			++nz;
		}

		/* last entry (absent when the row holds a single entry): guarded like the first */
		if(nz<rend)
		{
			col=bindx[nz];
			dot+=VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				ysh[col*incy]+=VA[nz]*xr;
		}

		out[row*incy]+=scal*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^H. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 * Every stored entry (i,j) feeds the dot product of row i and, mirrored, a scattered
	 * update at position j (the same value is used both ways). The mirrored update is
	 * skipped when roff==coff and j==i; that test is made only on the first and on the
	 * last entry of each row, while interior entries are always applied both ways.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	const float scal=*alphap;
	/* bases shifted by the roff/coff difference, used for the mirrored contribution */
	const float *xsh=rhs+(incx)*(roff-coff);
	float *ysh=out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		float dot=((float)(0));
		const float xr=scal*xsh[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;
		rsb_coo_idx_t col;

		/* a row without entries leaves out untouched */
		if(rbeg==rend)
			continue;

		/* first entry: may lie on the diagonal, hence the guarded mirrored update */
		col=bindx[nz];
		dot+=VA[nz]*rhs[col*incx];
		if(!(roff==coff && col==row))
			ysh[col*incy]+=VA[nz]*xr;

		/* interior entries, four per trip; half-width indices are widened to rsb_coo_idx_t first */
		nz=rbeg+1;
		while(nz+3<rend-1)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];
			const float v0=VA[nz  ],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float t0=v0*xr,t1=v1*xr,t2=v2*xr,t3=v3*xr;

			dot+=v0*x0;
			ysh[c0*incy]+=t0;
			dot+=v1*x1;
			ysh[c1*incy]+=t1;
			dot+=v2*x2;
			ysh[c2*incy]+=t2;
			dot+=v3*x3;
			ysh[c3*incy]+=t3;
			nz+=4;
		}

		/* remaining interior entries, one at a time */
		while(nz<rend-1)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0*incx];
			const float v0=VA[nz];
			const float t0=v0*xr;

			dot+=v0*x0;
			ysh[c0*incy]+=t0;
			++nz;
		}

		/* last entry (absent when the row holds a single entry): guarded like the first */
		if(nz<rend)
		{
			col=bindx[nz];
			dot+=VA[nz]*rhs[col*incx];
			if(!(roff==coff && col==row))
				ysh[col*incy]+=VA[nz]*xr;
		}

		out[row*incy]+=scal*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^H. \f$
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 * All arguments are forwarded unchanged to the non transposed kernel of the same symmetry.
	 * \return \rsb_errval_inp_param_msg (whatever the delegate kernel returns)
	 */

	/* The transposed case is served by the non transposed kernel for this float type. */
	const rsb_err_t errval=rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sH_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tT_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^H. \f$
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 * All arguments are forwarded unchanged to the non transposed kernel of the same symmetry.
	 * \return \rsb_errval_inp_param_msg (whatever the delegate kernel returns)
	 */

	/* The transposed case is served by the non transposed kernel for this float type. */
	const rsb_err_t errval=rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sH_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_coo_idx_t column indices.
	 * All arguments are forwarded unchanged to the non transposed kernel of the same symmetry.
	 * \return \rsb_errval_inp_param_msg (whatever the delegate kernel returns)
	 */

	/* With A == A^H the conjugate transposed product equals the non transposed one. */
	const rsb_err_t errval=rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sH_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tC_r1_c1_uu_sH_dE_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float, with rsb_half_idx_t column indices.
	 * All arguments are forwarded unchanged to the non transposed kernel of the same symmetry.
	 * \return \rsb_errval_inp_param_msg (whatever the delegate kernel returns)
	 */

	/* With A == A^H the conjugate transposed product equals the non transposed one. */
	const rsb_err_t errval=rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sH_dE_uG(
		VA,rhs,out,Mdim,mdim,bindx,bpntr,indptr,rpntr,cpntr,
		br,bc,roff,coff,flags,alphap,incx,incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 * Rows br (included) to bc (excluded) are visited; bpntr[i] and bpntr[i+1]
	 * delimit the entries of row i inside VA and bindx.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not applied in this routine: see the caller level. */
	const float scal=*alphap;
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;
		/* running dot product of this row with the input vector */
		float dot=((float)(0));

		/* main part: four entries per trip, summed strictly in storage order */
		while(nz+3<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];

			dot+=VA[nz  ]*x0;
			dot+=VA[nz+1]*x1;
			dot+=VA[nz+2]*x2;
			dot+=VA[nz+3]*x3;
			nz+=4;
		}

		/* leftover entries (at most three), one at a time */
		while(nz<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0*incx];

			dot+=VA[nz]*x0;
			++nz;
		}

		out[row*incy]+=scal*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 * Rows br (included) to bc (excluded) are visited; bpntr[i] and bpntr[i+1]
	 * delimit the entries of row i inside VA and bindx.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not applied in this routine: see the caller level. */
	const float scal=*alphap;
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;
		/* running dot product of this row with the input vector */
		float dot=((float)(0));

		/* main part: four entries per trip; half-width indices are widened to rsb_coo_idx_t first */
		while(nz+3<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float x0=rhs[c0*incx],x1=rhs[c1*incx],x2=rhs[c2*incx],x3=rhs[c3*incx];

			dot+=VA[nz  ]*x0;
			dot+=VA[nz+1]*x1;
			dot+=VA[nz+2]*x2;
			dot+=VA[nz+3]*x3;
			nz+=4;
		}

		/* leftover entries (at most three), one at a time */
		while(nz<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float x0=rhs[c0*incx];

			dot+=VA[nz]*x0;
			++nz;
		}

		out[row*incy]+=scal*dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 * Stored rows br (included) to bc (excluded) are visited; bpntr[i] and bpntr[i+1]
	 * delimit the entries of row i inside VA and bindx.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not applied in this routine: see the caller level. */
	const float scal=*alphap;
	/* both vectors are addressed through bases shifted by the roff/coff difference */
	const float *xsh=rhs+(incx)*(roff-coff);
	float *ysh=out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		/* scaled input element shared by every entry of this stored row */
		const float xr=scal*xsh[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;

		/* main part: four entries per trip, all products formed before the scattered updates */
		while(nz+3<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float t0=VA[nz  ]*xr;
			const float t1=VA[nz+1]*xr;
			const float t2=VA[nz+2]*xr;
			const float t3=VA[nz+3]*xr;

			ysh[c0*incy]+=t0;
			ysh[c1*incy]+=t1;
			ysh[c2*incy]+=t2;
			ysh[c3*incy]+=t3;
			nz+=4;
		}

		/* leftover entries (at most three), one at a time */
		while(nz<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float t0=VA[nz]*xr;

			ysh[c0*incy]+=t0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tT_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 * Stored rows br (included) to bc (excluded) are visited; bpntr[i] and bpntr[i+1]
	 * delimit the entries of row i inside VA and bindx.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not applied in this routine: see the caller level. */
	const float scal=*alphap;
	/* both vectors are addressed through bases shifted by the roff/coff difference */
	const float *xsh=rhs+(incx)*(roff-coff);
	float *ysh=out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		/* scaled input element shared by every entry of this stored row */
		const float xr=scal*xsh[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;

		/* main part: four entries per trip; half-width indices are widened to rsb_coo_idx_t first */
		while(nz+3<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float t0=VA[nz  ]*xr;
			const float t1=VA[nz+1]*xr;
			const float t2=VA[nz+2]*xr;
			const float t3=VA[nz+3]*xr;

			ysh[c0*incy]+=t0;
			ysh[c1*incy]+=t1;
			ysh[c2*incy]+=t2;
			ysh[c3*incy]+=t3;
			nz+=4;
		}

		/* leftover entries (at most three), one at a time */
		while(nz<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float t0=VA[nz]*xr;

			ysh[c0*incy]+=t0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * The product is added into \c out; \c out is not rescaled by this routine.
	 * incx and incy are the x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 * Stored rows br (included) to bc (excluded) are visited; bpntr[i] and bpntr[i+1]
	 * delimit the entries of row i inside VA and bindx.
	 * \return \rsb_errval_inp_param_msg (the code below always yields RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not applied in this routine: see the caller level. */
	const float scal=*alphap;
	/* both vectors are addressed through bases shifted by the roff/coff difference */
	const float *xsh=rhs+(incx)*(roff-coff);
	float *ysh=out+(incy)*(coff-roff);
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		/* scaled input element shared by every entry of this stored row */
		const float xr=scal*xsh[incx*row];
		const rsb_nnz_idx_t rbeg=bpntr[row],rend=bpntr[row+1];
		rsb_nnz_idx_t nz=rbeg;

		/* main part: four entries per trip, all products formed before the scattered updates */
		while(nz+3<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz  ],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float t0=VA[nz  ]*xr;
			const float t1=VA[nz+1]*xr;
			const float t2=VA[nz+2]*xr;
			const float t3=VA[nz+3]*xr;

			ysh[c0*incy]+=t0;
			ysh[c1*incy]+=t1;
			ysh[c2*incy]+=t2;
			ysh[c3*incy]+=t3;
			nz+=4;
		}

		/* leftover entries (at most three), one at a time */
		while(nz<rend)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float t0=VA[nz]*xr;

			ysh[c0*incy]+=t0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tC_r1_c1_uu_sU_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * with incx and incy as x and y vector strides.
	 * No \f$\beta\f$ scaling happens in this routine: out is only ever added to.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Rows br (inclusive) to bc (exclusive) are visited. For each row, the
	 * shifted rhs entry of that row is scaled by alpha once, then every stored
	 * entry of the row adds its value times that factor to the shifted out
	 * entry selected by its column index (converted to rsb_coo_idx_t first).
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	const float scale = *alphap;
	const float *xsh = rhs + (incx) * (roff - coff);	/* rhs moved by the row/column offset difference */
	float *ysh = out + (incy) * (coff - roff);	/* out moved the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float xr = scale * xsh[incx * row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four stored entries per pass */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float add0 = VA[nz] * xr;
			const float add1 = VA[nz + 1] * xr;
			const float add2 = VA[nz + 2] * xr;
			const float add3 = VA[nz + 3] * xr;

			ysh[col0 * incy] += add0;
			ysh[col1 * incy] += add1;
			ysh[col2 * incy] += add2;
			ysh[col3 * incy] += add3;
			nz += 4;
		}

		/* leftover entries (fewer than four) */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float add = VA[nz] * xr;

			ysh[col * incy] += add;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides.
	 * No \f$\beta\f$ scaling happens in this routine: out is only ever added to.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every stored entry of a row is used twice: once in the dot product that
	 * is finally added (times alpha) to out[row*incy], and once scattered
	 * (times alpha and the shifted rhs entry of the row) to the shifted out
	 * entry of its column.
	 * The first and the last stored entry of a row skip the scatter when
	 * roff==coff and the column equals the row; the entries in between are
	 * always scattered (presumably they cannot lie on the diagonal -- to be
	 * confirmed against the storage invariants).
	 * Rows with no stored entries leave out untouched.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	const float scale = *alphap;
	const float *xsh = rhs + (incx) * (roff - coff);	/* rhs moved by the row/column offset difference */
	float *ysh = out + (incy) * (coff - roff);	/* out moved the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float dot = ((float)(0));
		const float xr = scale * xsh[incx * row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if (first == last)
			continue;

		/* leading entry: the scatter is suppressed on the diagonal */
		col = bindx[first];
		dot += VA[first] * rhs[col * incx];
		if (roff != coff || col != row)
			ysh[col * incy] += VA[first] * xr;

		/* interior entries, four per pass */
		nz = first + 1;
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = VA[nz];
			const float v1 = VA[nz + 1];
			const float v2 = VA[nz + 2];
			const float v3 = VA[nz + 3];
			const float add0 = v0 * xr;
			const float add1 = v1 * xr;
			const float add2 = v2 * xr;
			const float add3 = v3 * xr;

			dot += v0 * x0;
			ysh[col0 * incy] += add0;
			dot += v1 * x1;
			ysh[col1 * incy] += add1;
			dot += v2 * x2;
			ysh[col2 * incy] += add2;
			dot += v3 * x3;
			ysh[col3 * incy] += add3;
			nz += 4;
		}

		/* interior leftovers */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const float add0 = v0 * xr;

			dot += v0 * x0;
			ysh[col0 * incy] += add0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if (nz < last)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col * incx];
			if (roff != coff || col != row)
				ysh[col * incy] += VA[nz] * xr;
		}

		out[row * incy] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides.
	 * No \f$\beta\f$ scaling happens in this routine: out is only ever added to.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every stored entry of a row is used twice: once in the dot product that
	 * is finally added (times alpha) to out[row*incy], and once scattered
	 * (times alpha and the shifted rhs entry of the row) to the shifted out
	 * entry of its column.
	 * The first and the last stored entry of a row skip the scatter when
	 * roff==coff and the column equals the row; the entries in between are
	 * always scattered (presumably they cannot lie on the diagonal -- to be
	 * confirmed against the storage invariants).
	 * Rows with no stored entries leave out untouched.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	const float scale = *alphap;
	const float *xsh = rhs + (incx) * (roff - coff);	/* rhs moved by the row/column offset difference */
	float *ysh = out + (incy) * (coff - roff);	/* out moved the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float dot = ((float)(0));
		const float xr = scale * xsh[incx * row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if (first == last)
			continue;

		/* leading entry: the scatter is suppressed on the diagonal */
		col = bindx[first];
		dot += VA[first] * rhs[col * incx];
		if (roff != coff || col != row)
			ysh[col * incy] += VA[first] * xr;

		/* interior entries, four per pass */
		nz = first + 1;
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = VA[nz];
			const float v1 = VA[nz + 1];
			const float v2 = VA[nz + 2];
			const float v3 = VA[nz + 3];
			const float add0 = v0 * xr;
			const float add1 = v1 * xr;
			const float add2 = v2 * xr;
			const float add3 = v3 * xr;

			dot += v0 * x0;
			ysh[col0 * incy] += add0;
			dot += v1 * x1;
			ysh[col1 * incy] += add1;
			dot += v2 * x2;
			ysh[col2 * incy] += add2;
			dot += v3 * x3;
			ysh[col3 * incy] += add3;
			nz += 4;
		}

		/* interior leftovers */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const float add0 = v0 * xr;

			dot += v0 * x0;
			ysh[col0 * incy] += add0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if (nz < last)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col * incx];
			if (roff != coff || col != row)
				ysh[col * incy] += VA[nz] * xr;
		}

		out[row * incy] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed product kernel \f$ {A^T} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Does no work of its own: every argument is handed unchanged to the
	 * untransposed symmetric kernel and that kernel's status is returned.
	 * \return the error code of the kernel delegated to.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	/* For a symmetric matrix the transposed case falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sS_dI_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, alphap, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tT_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed product kernel \f$ {A^T} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Does no work of its own: every argument is handed unchanged to the
	 * untransposed symmetric kernel and that kernel's status is returned.
	 * \return the error code of the kernel delegated to.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	/* For a symmetric matrix the transposed case falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sS_dI_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, alphap, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed product kernel \f$ {A^H} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Does no work of its own: every argument is handed unchanged to the
	 * untransposed symmetric kernel and that kernel's status is returned.
	 * \return the error code of the kernel delegated to.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	/* For a symmetric matrix the transposed case falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sS_dI_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, alphap, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tC_r1_c1_uu_sS_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed product kernel \f$ {A^H} \cdot x, where A == A^T. \f$
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Does no work of its own: every argument is handed unchanged to the
	 * untransposed symmetric kernel and that kernel's status is returned.
	 * \return the error code of the kernel delegated to.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	/* For a symmetric matrix the transposed case falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sS_dI_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, alphap, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides.
	 * No \f$\beta\f$ scaling happens in this routine: out is only ever added to.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Every stored entry of a row is used twice: once in the dot product that
	 * is finally added (times alpha) to out[row*incy], and once scattered
	 * (times alpha and the shifted rhs entry of the row) to the shifted out
	 * entry of its column. No conjugation is applied anywhere (the type is real).
	 * The first and the last stored entry of a row skip the scatter when
	 * roff==coff and the column equals the row; the entries in between are
	 * always scattered (presumably they cannot lie on the diagonal -- to be
	 * confirmed against the storage invariants).
	 * Rows with no stored entries leave out untouched.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	const float scale = *alphap;
	const float *xsh = rhs + (incx) * (roff - coff);	/* rhs moved by the row/column offset difference */
	float *ysh = out + (incy) * (coff - roff);	/* out moved the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float dot = ((float)(0));
		const float xr = scale * xsh[incx * row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if (first == last)
			continue;

		/* leading entry: the scatter is suppressed on the diagonal */
		col = bindx[first];
		dot += VA[first] * rhs[col * incx];
		if (roff != coff || col != row)
			ysh[col * incy] += VA[first] * xr;

		/* interior entries, four per pass */
		nz = first + 1;
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = VA[nz];
			const float v1 = VA[nz + 1];
			const float v2 = VA[nz + 2];
			const float v3 = VA[nz + 3];
			const float add0 = v0 * xr;
			const float add1 = v1 * xr;
			const float add2 = v2 * xr;
			const float add3 = v3 * xr;

			dot += v0 * x0;
			ysh[col0 * incy] += add0;
			dot += v1 * x1;
			ysh[col1 * incy] += add1;
			dot += v2 * x2;
			ysh[col2 * incy] += add2;
			dot += v3 * x3;
			ysh[col3 * incy] += add3;
			nz += 4;
		}

		/* interior leftovers */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const float add0 = v0 * xr;

			dot += v0 * x0;
			ysh[col0 * incy] += add0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if (nz < last)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col * incx];
			if (roff != coff || col != row)
				ysh[col * incy] += VA[nz] * xr;
		}

		out[row * incy] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides.
	 * No \f$\beta\f$ scaling happens in this routine: out is only ever added to.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Every stored entry of a row is used twice: once in the dot product that
	 * is finally added (times alpha) to out[row*incy], and once scattered
	 * (times alpha and the shifted rhs entry of the row) to the shifted out
	 * entry of its column. No conjugation is applied anywhere (the type is real).
	 * The first and the last stored entry of a row skip the scatter when
	 * roff==coff and the column equals the row; the entries in between are
	 * always scattered (presumably they cannot lie on the diagonal -- to be
	 * confirmed against the storage invariants).
	 * Rows with no stored entries leave out untouched.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	const float scale = *alphap;
	const float *xsh = rhs + (incx) * (roff - coff);	/* rhs moved by the row/column offset difference */
	float *ysh = out + (incy) * (coff - roff);	/* out moved the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float dot = ((float)(0));
		const float xr = scale * xsh[incx * row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if (first == last)
			continue;

		/* leading entry: the scatter is suppressed on the diagonal */
		col = bindx[first];
		dot += VA[first] * rhs[col * incx];
		if (roff != coff || col != row)
			ysh[col * incy] += VA[first] * xr;

		/* interior entries, four per pass */
		nz = first + 1;
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float x0 = rhs[col0 * incx];
			const float x1 = rhs[col1 * incx];
			const float x2 = rhs[col2 * incx];
			const float x3 = rhs[col3 * incx];
			const float v0 = VA[nz];
			const float v1 = VA[nz + 1];
			const float v2 = VA[nz + 2];
			const float v3 = VA[nz + 3];
			const float add0 = v0 * xr;
			const float add1 = v1 * xr;
			const float add2 = v2 * xr;
			const float add3 = v3 * xr;

			dot += v0 * x0;
			ysh[col0 * incy] += add0;
			dot += v1 * x1;
			ysh[col1 * incy] += add1;
			dot += v2 * x2;
			ysh[col2 * incy] += add2;
			dot += v3 * x3;
			ysh[col3 * incy] += add3;
			nz += 4;
		}

		/* interior leftovers */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float x0 = rhs[col0 * incx];
			const float v0 = VA[nz];
			const float add0 = v0 * xr;

			dot += v0 * x0;
			ysh[col0 * incy] += add0;
			++nz;
		}

		/* trailing entry (absent when the row holds a single entry) */
		if (nz < last)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col * incx];
			if (roff != coff || col != row)
				ysh[col * incy] += VA[nz] * xr;
		}

		out[row * incy] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed product kernel \f$ {A^T} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Does no work of its own: every argument is handed unchanged to the
	 * untransposed Hermitian kernel and that kernel's status is returned.
	 * \return the error code of the kernel delegated to.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	/* For a symmetric matrix the transposed case falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sH_dI_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, alphap, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tT_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed product kernel \f$ {A^T} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Does no work of its own: every argument is handed unchanged to the
	 * untransposed Hermitian kernel and that kernel's status is returned.
	 * \return the error code of the kernel delegated to.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	/* For a symmetric matrix the transposed case falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sH_dI_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, alphap, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_C__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed product kernel \f$ {A^H} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_coo_idx_t column indices.
	 *
	 * Does no work of its own: every argument is handed unchanged to the
	 * untransposed Hermitian kernel and that kernel's status is returned.
	 * \return the error code of the kernel delegated to.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	/* For a symmetric matrix the transposed case falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_float_C__tN_r1_c1_uu_sH_dI_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, alphap, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_sxsa_float_H__tC_r1_c1_uu_sH_dI_uG(const float * restrict VA, const float * restrict rhs, float * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float * restrict alphap,rsb_coo_idx_t incx, rsb_coo_idx_t incy)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed product kernel \f$ {A^H} \cdot x, where A == A^H. \f$
	 * with incx and incy as x and y vector strides.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float, with rsb_half_idx_t column indices.
	 *
	 * Does no work of its own: every argument is handed unchanged to the
	 * untransposed Hermitian kernel and that kernel's status is returned.
	 * \return the error code of the kernel delegated to.
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel; see the caller level. */
	/* For a symmetric matrix the transposed case falls back to the untransposed one. */
	const rsb_err_t errval = rsb__BCSR_spmv_sxsa_float_H__tN_r1_c1_uu_sH_dI_uG(VA, rhs, out, Mdim, mdim, bindx, bpntr, indptr, rpntr, cpntr, br, bc, roff, coff, flags, alphap, incx, incy);

	return errval;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tN_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (inclusive) to bc (exclusive) are visited. The stored entries of
	 * a row are multiplied by the rhs entries picked by their column indices,
	 * summed one product at a time in storage order, and the sum is added to
	 * out[row]. Both vectors are accessed with unit stride.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex dot = ((float complex)(0));
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four stored entries per pass */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];

			dot += v0 * x0;
			dot += v1 * x1;
			dot += v2 * x2;
			dot += v3 * x3;
			nz += 4;
		}

		/* leftover entries (fewer than four) */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float complex x = rhs[col];
			const float complex v = VA[nz];

			dot += v * x;
			++nz;
		}

		out[row] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tN_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br (inclusive) to bc (exclusive) are visited. The stored entries of
	 * a row are multiplied by the rhs entries picked by their column indices
	 * (converted to rsb_coo_idx_t first), summed one product at a time in
	 * storage order, and the sum is added to out[row]. Both vectors are
	 * accessed with unit stride.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex dot = ((float complex)(0));
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four stored entries per pass */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];

			dot += v0 * x0;
			dot += v1 * x1;
			dot += v2 * x2;
			dot += v3 * x3;
			nz += 4;
		}

		/* leftover entries (fewer than four) */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float complex x = rhs[col];
			const float complex v = VA[nz];

			dot += v * x;
			++nz;
		}

		out[row] += dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tT_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (inclusive) to bc (exclusive) are visited. For each row the
	 * shifted rhs entry of that row is read once; every stored entry of the
	 * row then adds its value times that entry to the shifted out entry
	 * selected by its column index. Both vectors are accessed with unit stride.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	const float complex *xsh = rhs + 1 * (roff - coff);	/* rhs moved by the row/column offset difference */
	float complex *ysh = out + 1 * (coff - roff);	/* out moved the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr = xsh[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four stored entries per pass */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex add0 = VA[nz] * xr;
			const float complex add1 = VA[nz + 1] * xr;
			const float complex add2 = VA[nz + 2] * xr;
			const float complex add3 = VA[nz + 3] * xr;

			ysh[col0] += add0;
			ysh[col1] += add1;
			ysh[col2] += add2;
			ysh[col3] += add3;
			nz += 4;
		}

		/* leftover entries (fewer than four) */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float complex add = VA[nz] * xr;

			ysh[col] += add;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tT_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br (inclusive) to bc (exclusive) are visited. For each row the
	 * shifted rhs entry of that row is read once; every stored entry of the
	 * row then adds its value times that entry to the shifted out entry
	 * selected by its column index (converted to rsb_coo_idx_t first).
	 * Both vectors are accessed with unit stride.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	const float complex *xsh = rhs + 1 * (roff - coff);	/* rhs moved by the row/column offset difference */
	float complex *ysh = out + 1 * (coff - roff);	/* out moved the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr = xsh[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four stored entries per pass */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex add0 = VA[nz] * xr;
			const float complex add1 = VA[nz + 1] * xr;
			const float complex add2 = VA[nz + 2] * xr;
			const float complex add3 = VA[nz + 3] * xr;

			ysh[col0] += add0;
			ysh[col1] += add1;
			ysh[col2] += add2;
			ysh[col3] += add3;
			nz += 4;
		}

		/* leftover entries (fewer than four) */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float complex add = VA[nz] * xr;

			ysh[col] += add;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tC_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (inclusive) to bc (exclusive) are visited. For each row the
	 * shifted rhs entry of that row is read once; every stored entry of the
	 * row then adds its complex conjugate (conjf) times that entry to the
	 * shifted out entry selected by its column index. Both vectors are
	 * accessed with unit stride.
	 * \return RSB_ERR_NO_ERROR, unconditionally.
	 */

	const float complex *xsh = rhs + 1 * (roff - coff);	/* rhs moved by the row/column offset difference */
	float complex *ysh = out + 1 * (coff - roff);	/* out moved the opposite way */
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr = xsh[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* main part: four stored entries per pass */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex v0 = conjf(VA[nz]);
			const float complex v1 = conjf(VA[nz + 1]);
			const float complex v2 = conjf(VA[nz + 2]);
			const float complex v3 = conjf(VA[nz + 3]);
			const float complex add0 = v0 * xr;
			const float complex add1 = v1 * xr;
			const float complex add2 = v2 * xr;
			const float complex add3 = v3 * xr;

			ysh[col0] += add0;
			ysh[col1] += add1;
			ysh[col2] += add2;
			ysh[col3] += add3;
			nz += 4;
		}

		/* leftover entries (fewer than four) */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float complex v = conjf(VA[nz]);
			const float complex add = v * xr;

			ysh[col] += add;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tC_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector product kernel, \f$y \leftarrow y + {A^H} \cdot x\f$, for a general (\f$A \neq A^T\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_half_idx_t column indices.
	 *
	 * For every row i in [br,bc) the input entry of that row is read once and scattered:
	 * each nonzero k in [bpntr[i],bpntr[i+1]) adds conjf(VA[k]) times that entry to the output entry selected by bindx[k].
	 * Input and output are addressed through pointers shifted by (roff-coff) and (coff-roff) respectively.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		const float complex xi = xs[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz = bpntr[row];

		/* four nonzeroes per pass: all products first, then the updates in index order */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz+1];
			const rsb_coo_idx_t col2 = bindx[nz+2];
			const rsb_coo_idx_t col3 = bindx[nz+3];
			const float complex p0 = conjf(VA[nz])*xi;
			const float complex p1 = conjf(VA[nz+1])*xi;
			const float complex p2 = conjf(VA[nz+2])*xi;
			const float complex p3 = conjf(VA[nz+3])*xi;

			ys[col0] += p0;
			ys[col1] += p1;
			ys[col2] += p2;
			ys[col3] += p3;
			nz += 4;
		}

		/* leftover nonzeroes, one at a time */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];
			const float complex p = conjf(VA[nz])*xi;

			ys[col] += p;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tN_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector product kernel, \f$y \leftarrow y + {A} \cdot x\f$, for a symmetric (\f$A = A^T\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * VA[k]*rhs[j] is accumulated into out[i], and VA[k] times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += VA[first]*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = a0*xi;
			const float complex t1 = a1*xi;
			const float complex t2 = a2*xi;
			const float complex t3 = a3*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			row_sum += a1*x1;
			ys[c1] += t1;
			row_sum += a2*x2;
			ys[c2] += t2;
			row_sum += a3*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = a0*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += VA[nz]*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tN_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector product kernel, \f$y \leftarrow y + {A} \cdot x\f$, for a symmetric (\f$A = A^T\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_half_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * VA[k]*rhs[j] is accumulated into out[i], and VA[k] times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += VA[first]*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = a0*xi;
			const float complex t1 = a1*xi;
			const float complex t2 = a2*xi;
			const float complex t3 = a3*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			row_sum += a1*x1;
			ys[c1] += t1;
			row_sum += a2*x2;
			ys[c2] += t2;
			row_sum += a3*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = a0*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += VA[nz]*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tT_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector product kernel, \f$y \leftarrow y + {A^T} \cdot x\f$, for a symmetric (\f$A = A^T\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * VA[k]*rhs[j] is accumulated into out[i], and VA[k] times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += VA[first]*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = a0*xi;
			const float complex t1 = a1*xi;
			const float complex t2 = a2*xi;
			const float complex t3 = a3*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			row_sum += a1*x1;
			ys[c1] += t1;
			row_sum += a2*x2;
			ys[c2] += t2;
			row_sum += a3*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = a0*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += VA[nz]*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tT_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector product kernel, \f$y \leftarrow y + {A^T} \cdot x\f$, for a symmetric (\f$A = A^T\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_half_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * VA[k]*rhs[j] is accumulated into out[i], and VA[k] times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += VA[first]*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = a0*xi;
			const float complex t1 = a1*xi;
			const float complex t2 = a2*xi;
			const float complex t3 = a3*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			row_sum += a1*x1;
			ys[c1] += t1;
			row_sum += a2*x2;
			ys[c2] += t2;
			row_sum += a3*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = a0*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += VA[nz]*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tC_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector product kernel, \f$y \leftarrow y + {A^H} \cdot x\f$, for a symmetric (\f$A = A^T\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice, conjugated both times:
	 * conjf(VA[k])*rhs[j] is accumulated into out[i], and conjf(VA[k]) times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += conjf(VA[first])*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += conjf(VA[first])*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = conjf(a0)*xi;
			const float complex t1 = conjf(a1)*xi;
			const float complex t2 = conjf(a2)*xi;
			const float complex t3 = conjf(a3)*xi;

			row_sum += conjf(a0)*x0;
			ys[c0] += t0;
			row_sum += conjf(a1)*x1;
			ys[c1] += t1;
			row_sum += conjf(a2)*x2;
			ys[c2] += t2;
			row_sum += conjf(a3)*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = conjf(a0)*xi;

			row_sum += conjf(a0)*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += conjf(VA[nz])*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += conjf(VA[nz])*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tC_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector product kernel, \f$y \leftarrow y + {A^H} \cdot x\f$, for a symmetric (\f$A = A^T\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_half_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice, conjugated both times:
	 * conjf(VA[k])*rhs[j] is accumulated into out[i], and conjf(VA[k]) times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += conjf(VA[first])*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += conjf(VA[first])*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = conjf(a0)*xi;
			const float complex t1 = conjf(a1)*xi;
			const float complex t2 = conjf(a2)*xi;
			const float complex t3 = conjf(a3)*xi;

			row_sum += conjf(a0)*x0;
			ys[c0] += t0;
			row_sum += conjf(a1)*x1;
			ys[c1] += t1;
			row_sum += conjf(a2)*x2;
			ys[c2] += t2;
			row_sum += conjf(a3)*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = conjf(a0)*xi;

			row_sum += conjf(a0)*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += conjf(VA[nz])*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += conjf(VA[nz])*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tN_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector product kernel, \f$y \leftarrow y + {A} \cdot x\f$, for a Hermitian (\f$A = A^H\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * VA[k]*rhs[j] is accumulated into out[i], and conjf(VA[k]) times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += conjf(VA[first])*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = conjf(a0)*xi;
			const float complex t1 = conjf(a1)*xi;
			const float complex t2 = conjf(a2)*xi;
			const float complex t3 = conjf(a3)*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			row_sum += a1*x1;
			ys[c1] += t1;
			row_sum += a2*x2;
			ys[c2] += t2;
			row_sum += a3*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = conjf(a0)*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += conjf(VA[nz])*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tN_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Sparse matrix-vector product kernel, \f$y \leftarrow y + {A} \cdot x\f$, for a Hermitian (\f$A = A^H\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_half_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * VA[k]*rhs[j] is accumulated into out[i], and conjf(VA[k]) times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += conjf(VA[first])*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = conjf(a0)*xi;
			const float complex t1 = conjf(a1)*xi;
			const float complex t2 = conjf(a2)*xi;
			const float complex t3 = conjf(a3)*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			row_sum += a1*x1;
			ys[c1] += t1;
			row_sum += a2*x2;
			ys[c2] += t2;
			row_sum += a3*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = conjf(a0)*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += conjf(VA[nz])*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tT_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector product kernel, \f$y \leftarrow y + {A^T} \cdot x\f$, for a Hermitian (\f$A = A^H\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * conjf(VA[k])*rhs[j] is accumulated into out[i], and VA[k] times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += conjf(VA[first])*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += VA[first]*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = a0*xi;
			const float complex t1 = a1*xi;
			const float complex t2 = a2*xi;
			const float complex t3 = a3*xi;

			row_sum += conjf(a0)*x0;
			ys[c0] += t0;
			row_sum += conjf(a1)*x1;
			ys[c1] += t1;
			row_sum += conjf(a2)*x2;
			ys[c2] += t2;
			row_sum += conjf(a3)*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = a0*xi;

			row_sum += conjf(a0)*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += conjf(VA[nz])*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += VA[nz]*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tT_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Transposed sparse matrix-vector product kernel, \f$y \leftarrow y + {A^T} \cdot x\f$, for a Hermitian (\f$A = A^H\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_half_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * conjf(VA[k])*rhs[j] is accumulated into out[i], and VA[k] times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += conjf(VA[first])*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += VA[first]*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = a0*xi;
			const float complex t1 = a1*xi;
			const float complex t2 = a2*xi;
			const float complex t3 = a3*xi;

			row_sum += conjf(a0)*x0;
			ys[c0] += t0;
			row_sum += conjf(a1)*x1;
			ys[c1] += t1;
			row_sum += conjf(a2)*x2;
			ys[c2] += t2;
			row_sum += conjf(a3)*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = a0*xi;

			row_sum += conjf(a0)*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += conjf(VA[nz])*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += VA[nz]*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tC_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Conjugate-transposed sparse matrix-vector product kernel, \f$y \leftarrow y + {A^H} \cdot x\f$, for a Hermitian (\f$A = A^H\f$) matrix.
	 * A is expected in 1 x 1 blocked BCSR storage with an explicit diagonal, float complex values and rsb_coo_idx_t column indices.
	 *
	 * Each stored nonzero k of a row i in [br,bc), with column j=bindx[k], is applied twice:
	 * VA[k]*rhs[j] is accumulated into out[i], and conjf(VA[k]) times the shifted input entry of row i is added to the shifted output entry j.
	 * For the first and the last nonzero of a row the second update is skipped when roff==coff and j==i, so a diagonal entry is counted once.
	 * Nonzeroes in between get no such test (NOTE(review): presumably column indices are sorted within a row, so only the first or last one can be diagonal; confirm).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used here.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	const float complex *const xs = rhs+(roff-coff);	/* shifted view of the input vector */
	float complex *const ys = out+(coff-roff);		/* shifted view of the output vector */
	rsb_coo_idx_t row;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* rows of the bounded box */
	{
		float complex row_sum = (float complex)0;
		const float complex xi = xs[row];
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row+1];
		rsb_nnz_idx_t nz;
		rsb_coo_idx_t col;

		if(first==row_end)
			continue;	/* empty row: nothing to add */

		/* leading nonzero: the only place, besides the trailing one, where the diagonal is tested */
		col = bindx[first];
		row_sum += VA[first]*rhs[col];
		if(!(roff==coff && col==row))
			ys[col] += conjf(VA[first])*xi;

		/* interior nonzeroes, four per pass */
		nz = first+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz+1];
			const rsb_coo_idx_t c2 = bindx[nz+2];
			const rsb_coo_idx_t c3 = bindx[nz+3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex a0 = VA[nz];
			const float complex a1 = VA[nz+1];
			const float complex a2 = VA[nz+2];
			const float complex a3 = VA[nz+3];
			const float complex t0 = conjf(a0)*xi;
			const float complex t1 = conjf(a1)*xi;
			const float complex t2 = conjf(a2)*xi;
			const float complex t3 = conjf(a3)*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			row_sum += a1*x1;
			ys[c1] += t1;
			row_sum += a2*x2;
			ys[c2] += t2;
			row_sum += a3*x3;
			ys[c3] += t3;
			nz += 4;
		}

		/* interior leftovers, one at a time */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex t0 = conjf(a0)*xi;

			row_sum += a0*x0;
			ys[c0] += t0;
			++nz;
		}

		/* trailing nonzero (absent when the row holds a single entry) */
		if(nz<row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz]*rhs[col];
			if(!(roff==coff && col==row))
				ys[col] += conjf(VA[nz])*xi;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tC_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^H} \cdot x\f$, for a matrix with \f$A = A^H\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with explicit diagonal,
	 * float complex values and rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]), with j=bindx[k]:
	 *  - VA[k]*rhs[j] is summed and the row total is added to out[i];
	 *  - conjf(VA[k])*trhs[i] is added to tout[j] (the mirrored contribution),
	 * where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * The first and the last entry of a row skip the mirrored update if roff==coff and j==i.
	 * Interior entries are never tested; presumably column indices are sorted within a row,
	 * so that such an entry can only be first or last (to be confirmed at caller level).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col = 0;

		/* an empty row leaves out[row] untouched */
		if (row_begin == row_end)
			continue;

		/* first entry of the row: the only leading one tested against the row index */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += conjf(VA[nz]) * x_row;

		/* interior entries, in groups of four: all loads precede the updates */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex m0 = conjf(v0) * x_row;
			const float complex m1 = conjf(v1) * x_row;
			const float complex m2 = conjf(v2) * x_row;
			const float complex m3 = conjf(v3) * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			row_sum += v1 * x1;
			y_shifted[c1] += m1;
			row_sum += v2 * x2;
			y_shifted[c2] += m2;
			row_sum += v3 * x3;
			y_shifted[c3] += m3;
			nz += 4;
		}

		/* remaining interior entries, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			++nz;
		}

		/* last entry of the row (if distinct from the first): tested like the first one */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += conjf(VA[nz]) * x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tN_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A} \cdot x\f$, for a matrix with \f$A \neq A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc), the products VA[k]*rhs[bindx[k]] for k in [bpntr[i],bpntr[i+1])
	 * are summed in storage order and the total is added to out[i] (also for empty rows).
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* entries in groups of four */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];

			row_sum += v0 * x0;
			row_sum += v1 * x1;
			row_sum += v2 * x2;
			row_sum += v3 * x3;
			nz += 4;
		}

		/* leftover entries, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];

			row_sum += v0 * x0;
			++nz;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tN_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A} \cdot x\f$, for a matrix with \f$A \neq A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc), the products VA[k]*rhs[bindx[k]] for k in [bpntr[i],bpntr[i+1])
	 * are summed in storage order and the total is added to out[i] (also for empty rows).
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* entries in groups of four */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];

			row_sum += v0 * x0;
			row_sum += v1 * x1;
			row_sum += v2 * x2;
			row_sum += v3 * x3;
			nz += 4;
		}

		/* leftover entries, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];

			row_sum += v0 * x0;
			++nz;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tT_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^T} \cdot x\f$, for a matrix with \f$A \neq A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]),
	 * VA[k]*trhs[i] is added to tout[bindx[k]], where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* entries in groups of four: the four products are formed before any update */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex p0 = v0 * x_row;
			const float complex p1 = v1 * x_row;
			const float complex p2 = v2 * x_row;
			const float complex p3 = v3 * x_row;

			y_shifted[c0] += p0;
			y_shifted[c1] += p1;
			y_shifted[c2] += p2;
			y_shifted[c3] += p3;
			nz += 4;
		}

		/* leftover entries, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex v0 = VA[nz];
			const float complex p0 = v0 * x_row;

			y_shifted[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tT_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^T} \cdot x\f$, for a matrix with \f$A \neq A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]),
	 * VA[k]*trhs[i] is added to tout[bindx[k]], where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* entries in groups of four: the four products are formed before any update */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex p0 = v0 * x_row;
			const float complex p1 = v1 * x_row;
			const float complex p2 = v2 * x_row;
			const float complex p3 = v3 * x_row;

			y_shifted[c0] += p0;
			y_shifted[c1] += p1;
			y_shifted[c2] += p2;
			y_shifted[c3] += p3;
			nz += 4;
		}

		/* leftover entries, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex v0 = VA[nz];
			const float complex p0 = v0 * x_row;

			y_shifted[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tC_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^H} \cdot x\f$, for a matrix with \f$A \neq A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]),
	 * conjf(VA[k])*trhs[i] is added to tout[bindx[k]], where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* entries in groups of four: the four products are formed before any update */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex w0 = conjf(VA[nz]);
			const float complex w1 = conjf(VA[nz + 1]);
			const float complex w2 = conjf(VA[nz + 2]);
			const float complex w3 = conjf(VA[nz + 3]);
			const float complex p0 = w0 * x_row;
			const float complex p1 = w1 * x_row;
			const float complex p2 = w2 * x_row;
			const float complex p3 = w3 * x_row;

			y_shifted[c0] += p0;
			y_shifted[c1] += p1;
			y_shifted[c2] += p2;
			y_shifted[c3] += p3;
			nz += 4;
		}

		/* leftover entries, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex w0 = conjf(VA[nz]);
			const float complex p0 = w0 * x_row;

			y_shifted[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tC_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^H} \cdot x\f$, for a matrix with \f$A \neq A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]),
	 * conjf(VA[k])*trhs[i] is added to tout[bindx[k]], where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;

		/* entries in groups of four: the four products are formed before any update */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex w0 = conjf(VA[nz]);
			const float complex w1 = conjf(VA[nz + 1]);
			const float complex w2 = conjf(VA[nz + 2]);
			const float complex w3 = conjf(VA[nz + 3]);
			const float complex p0 = w0 * x_row;
			const float complex p1 = w1 * x_row;
			const float complex p2 = w2 * x_row;
			const float complex p3 = w3 * x_row;

			y_shifted[c0] += p0;
			y_shifted[c1] += p1;
			y_shifted[c2] += p2;
			y_shifted[c3] += p3;
			nz += 4;
		}

		/* leftover entries, one at a time */
		while (nz < row_end)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex w0 = conjf(VA[nz]);
			const float complex p0 = w0 * x_row;

			y_shifted[c0] += p0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tN_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A} \cdot x\f$, for a matrix with \f$A = A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]), with j=bindx[k]:
	 *  - VA[k]*rhs[j] is summed and the row total is added to out[i];
	 *  - VA[k]*trhs[i] is added to tout[j] (the mirrored contribution),
	 * where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * The first and the last entry of a row skip the mirrored update if roff==coff and j==i.
	 * Interior entries are never tested; presumably column indices are sorted within a row,
	 * so that such an entry can only be first or last (to be confirmed at caller level).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col = 0;

		/* an empty row leaves out[row] untouched */
		if (row_begin == row_end)
			continue;

		/* first entry of the row: the only leading one tested against the row index */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += VA[nz] * x_row;

		/* interior entries, in groups of four: all loads precede the updates */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex m0 = v0 * x_row;
			const float complex m1 = v1 * x_row;
			const float complex m2 = v2 * x_row;
			const float complex m3 = v3 * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			row_sum += v1 * x1;
			y_shifted[c1] += m1;
			row_sum += v2 * x2;
			y_shifted[c2] += m2;
			row_sum += v3 * x3;
			y_shifted[c3] += m3;
			nz += 4;
		}

		/* remaining interior entries, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			++nz;
		}

		/* last entry of the row (if distinct from the first): tested like the first one */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += VA[nz] * x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tN_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A} \cdot x\f$, for a matrix with \f$A = A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]), with j=bindx[k]:
	 *  - VA[k]*rhs[j] is summed and the row total is added to out[i];
	 *  - VA[k]*trhs[i] is added to tout[j] (the mirrored contribution),
	 * where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * The first and the last entry of a row skip the mirrored update if roff==coff and j==i.
	 * Interior entries are never tested; presumably column indices are sorted within a row,
	 * so that such an entry can only be first or last (to be confirmed at caller level).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col = 0;

		/* an empty row leaves out[row] untouched */
		if (row_begin == row_end)
			continue;

		/* first entry of the row: the only leading one tested against the row index */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += VA[nz] * x_row;

		/* interior entries, in groups of four: all loads precede the updates */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex m0 = v0 * x_row;
			const float complex m1 = v1 * x_row;
			const float complex m2 = v2 * x_row;
			const float complex m3 = v3 * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			row_sum += v1 * x1;
			y_shifted[c1] += m1;
			row_sum += v2 * x2;
			y_shifted[c2] += m2;
			row_sum += v3 * x3;
			y_shifted[c3] += m3;
			nz += 4;
		}

		/* remaining interior entries, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			++nz;
		}

		/* last entry of the row (if distinct from the first): tested like the first one */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += VA[nz] * x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tT_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^T} \cdot x\f$, for a matrix with \f$A = A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]), with j=bindx[k]:
	 *  - VA[k]*rhs[j] is summed and the row total is added to out[i];
	 *  - VA[k]*trhs[i] is added to tout[j] (the mirrored contribution),
	 * where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * The first and the last entry of a row skip the mirrored update if roff==coff and j==i.
	 * Interior entries are never tested; presumably column indices are sorted within a row,
	 * so that such an entry can only be first or last (to be confirmed at caller level).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col = 0;

		/* an empty row leaves out[row] untouched */
		if (row_begin == row_end)
			continue;

		/* first entry of the row: the only leading one tested against the row index */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += VA[nz] * x_row;

		/* interior entries, in groups of four: all loads precede the updates */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex m0 = v0 * x_row;
			const float complex m1 = v1 * x_row;
			const float complex m2 = v2 * x_row;
			const float complex m3 = v3 * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			row_sum += v1 * x1;
			y_shifted[c1] += m1;
			row_sum += v2 * x2;
			y_shifted[c2] += m2;
			row_sum += v3 * x3;
			y_shifted[c3] += m3;
			nz += 4;
		}

		/* remaining interior entries, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			++nz;
		}

		/* last entry of the row (if distinct from the first): tested like the first one */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += VA[nz] * x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tT_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^T} \cdot x\f$, for a matrix with \f$A = A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]), with j=bindx[k]:
	 *  - VA[k]*rhs[j] is summed and the row total is added to out[i];
	 *  - VA[k]*trhs[i] is added to tout[j] (the mirrored contribution),
	 * where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * The first and the last entry of a row skip the mirrored update if roff==coff and j==i.
	 * Interior entries are never tested; presumably column indices are sorted within a row,
	 * so that such an entry can only be first or last (to be confirmed at caller level).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col = 0;

		/* an empty row leaves out[row] untouched */
		if (row_begin == row_end)
			continue;

		/* first entry of the row: the only leading one tested against the row index */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += VA[nz] * x_row;

		/* interior entries, in groups of four: all loads precede the updates */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex m0 = v0 * x_row;
			const float complex m1 = v1 * x_row;
			const float complex m2 = v2 * x_row;
			const float complex m3 = v3 * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			row_sum += v1 * x1;
			y_shifted[c1] += m1;
			row_sum += v2 * x2;
			y_shifted[c2] += m2;
			row_sum += v3 * x3;
			y_shifted[c3] += m3;
			nz += 4;
		}

		/* remaining interior entries, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * x_row;

			row_sum += v0 * x0;
			y_shifted[c0] += m0;
			++nz;
		}

		/* last entry of the row (if distinct from the first): tested like the first one */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += VA[nz] * x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tC_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^H} \cdot x\f$, for a matrix with \f$A = A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_coo_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]), with j=bindx[k]:
	 *  - conjf(VA[k])*rhs[j] is summed and the row total is added to out[i];
	 *  - conjf(VA[k])*trhs[i] is added to tout[j] (the mirrored contribution),
	 * where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * The first and the last entry of a row skip the mirrored update if roff==coff and j==i.
	 * Interior entries are never tested; presumably column indices are sorted within a row,
	 * so that such an entry can only be first or last (to be confirmed at caller level).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col = 0;

		/* an empty row leaves out[row] untouched */
		if (row_begin == row_end)
			continue;

		/* first entry of the row: the only leading one tested against the row index */
		col = bindx[nz];
		row_sum += conjf(VA[nz]) * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += conjf(VA[nz]) * x_row;

		/* interior entries, in groups of four: all loads precede the updates */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex w0 = conjf(VA[nz]);
			const float complex w1 = conjf(VA[nz + 1]);
			const float complex w2 = conjf(VA[nz + 2]);
			const float complex w3 = conjf(VA[nz + 3]);
			const float complex m0 = w0 * x_row;
			const float complex m1 = w1 * x_row;
			const float complex m2 = w2 * x_row;
			const float complex m3 = w3 * x_row;

			row_sum += w0 * x0;
			y_shifted[c0] += m0;
			row_sum += w1 * x1;
			y_shifted[c1] += m1;
			row_sum += w2 * x2;
			y_shifted[c2] += m2;
			row_sum += w3 * x3;
			y_shifted[c3] += m3;
			nz += 4;
		}

		/* remaining interior entries, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex w0 = conjf(VA[nz]);
			const float complex m0 = w0 * x_row;

			row_sum += w0 * x0;
			y_shifted[c0] += m0;
			++nz;
		}

		/* last entry of the row (if distinct from the first): tested like the first one */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += conjf(VA[nz]) * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += conjf(VA[nz]) * x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tC_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulates \f$y \leftarrow y + {A^H} \cdot x\f$, for a matrix with \f$A = A^T\f$.
	 * A is expected to be blocked 1 x 1, in BCSR format, with implicit diagonal,
	 * float complex values and rsb_half_idx_t column indices.
	 *
	 * For each row i in [br,bc) and each stored entry k in [bpntr[i],bpntr[i+1]), with j=bindx[k]:
	 *  - conjf(VA[k])*rhs[j] is summed and the row total is added to out[i];
	 *  - conjf(VA[k])*trhs[i] is added to tout[j] (the mirrored contribution),
	 * where trhs=rhs+(roff-coff) and tout=out+(coff-roff).
	 * The first and the last entry of a row skip the mirrored update if roff==coff and j==i.
	 * Interior entries are never tested; presumably column indices are sorted within a row,
	 * so that such an entry can only be first or last (to be confirmed at caller level).
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used by this kernel.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller. */
	rsb_coo_idx_t row = 0;
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);

	/* the row range is bounded by br/bc (experimental, for the bounded box patch) */
	for (row = br; RSB_LIKELY(row < bc); ++row)
	{
		float complex row_sum = 0.0f;
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_begin;
		rsb_coo_idx_t col = 0;

		/* an empty row leaves out[row] untouched */
		if (row_begin == row_end)
			continue;

		/* first entry of the row: the only leading one tested against the row index */
		col = bindx[nz];
		row_sum += conjf(VA[nz]) * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += conjf(VA[nz]) * x_row;

		/* interior entries, in groups of four: all loads precede the updates */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const rsb_coo_idx_t c1 = bindx[nz + 1];
			const rsb_coo_idx_t c2 = bindx[nz + 2];
			const rsb_coo_idx_t c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0];
			const float complex x1 = rhs[c1];
			const float complex x2 = rhs[c2];
			const float complex x3 = rhs[c3];
			const float complex w0 = conjf(VA[nz]);
			const float complex w1 = conjf(VA[nz + 1]);
			const float complex w2 = conjf(VA[nz + 2]);
			const float complex w3 = conjf(VA[nz + 3]);
			const float complex m0 = w0 * x_row;
			const float complex m1 = w1 * x_row;
			const float complex m2 = w2 * x_row;
			const float complex m3 = w3 * x_row;

			row_sum += w0 * x0;
			y_shifted[c0] += m0;
			row_sum += w1 * x1;
			y_shifted[c1] += m1;
			row_sum += w2 * x2;
			y_shifted[c2] += m2;
			row_sum += w3 * x3;
			y_shifted[c3] += m3;
			nz += 4;
		}

		/* remaining interior entries, one at a time */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex w0 = conjf(VA[nz]);
			const float complex m0 = w0 * x_row;

			row_sum += w0 * x0;
			y_shifted[c0] += m0;
			++nz;
		}

		/* last entry of the row (if distinct from the first): tested like the first one */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += conjf(VA[nz]) * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += conjf(VA[nz]) * x_row;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tN_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulating kernel for \f$y \leftarrow y + {A} \cdot x\f$, with \f$A = A^H\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; row i owns entries bpntr[i]..bpntr[i+1]-1 of VA and bindx.
	 * Each stored entry (i,j) is used twice: as is for out[i] (times rhs[j]) and,
	 * conjugated, for the mirrored update of out[j+coff-roff] (times rhs[i+roff-coff]).
	 * Only the first and the last entry of a row are tested against the diagonal: their
	 * mirrored update is skipped when roff==coff and j==i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex row_sum = ((float complex)(0));
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;
		rsb_coo_idx_t col;

		if (nz == row_end)
			continue;	/* nothing stored in this row */

		/* Leading entry: the mirrored update is suppressed on the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += conjf(VA[nz]) * x_row;
		++nz;

		/* Interior entries, four per pass; mirrored unconditionally. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = conjf(v1) * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = conjf(v2) * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = conjf(v3) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			row_sum += (v1) * x1;
			y_shifted[col1] += m1;
			row_sum += (v2) * x2;
			y_shifted[col2] += m2;
			row_sum += (v3) * x3;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled pass. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			++nz;
		}

		/* Trailing entry (if distinct from the leading one): diagonal-tested again. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += conjf(VA[nz]) * x_row;
		}
		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tN_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulating kernel for \f$y \leftarrow y + {A} \cdot x\f$, with \f$A = A^H\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; row i owns entries bpntr[i]..bpntr[i+1]-1 of VA and bindx.
	 * Each stored entry (i,j) is used twice: as is for out[i] (times rhs[j]) and,
	 * conjugated, for the mirrored update of out[j+coff-roff] (times rhs[i+roff-coff]).
	 * Only the first and the last entry of a row are tested against the diagonal: their
	 * mirrored update is skipped when roff==coff and j==i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex row_sum = ((float complex)(0));
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;
		rsb_coo_idx_t col;

		if (nz == row_end)
			continue;	/* nothing stored in this row */

		/* Leading entry: the mirrored update is suppressed on the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += conjf(VA[nz]) * x_row;
		++nz;

		/* Interior entries, four per pass; mirrored unconditionally. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = conjf(v1) * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = conjf(v2) * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = conjf(v3) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			row_sum += (v1) * x1;
			y_shifted[col1] += m1;
			row_sum += (v2) * x2;
			y_shifted[col2] += m2;
			row_sum += (v3) * x3;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled pass. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			++nz;
		}

		/* Trailing entry (if distinct from the leading one): diagonal-tested again. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += conjf(VA[nz]) * x_row;
		}
		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tT_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulating kernel for \f$y \leftarrow y + {A^T} \cdot x\f$, with \f$A = A^H\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; row i owns entries bpntr[i]..bpntr[i+1]-1 of VA and bindx.
	 * Each stored entry (i,j) is used twice: conjugated for out[i] (times rhs[j]) and,
	 * as is, for the mirrored update of out[j+coff-roff] (times rhs[i+roff-coff]).
	 * Only the first and the last entry of a row are tested against the diagonal: their
	 * mirrored update is skipped when roff==coff and j==i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex row_sum = ((float complex)(0));
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;
		rsb_coo_idx_t col;

		if (nz == row_end)
			continue;	/* nothing stored in this row */

		/* Leading entry: the mirrored update is suppressed on the diagonal. */
		col = bindx[nz];
		row_sum += conjf(VA[nz]) * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += VA[nz] * x_row;
		++nz;

		/* Interior entries, four per pass; mirrored unconditionally. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = (v0) * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = (v1) * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = (v2) * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = (v3) * x_row;

			row_sum += conjf(v0) * x0;
			y_shifted[col0] += m0;
			row_sum += conjf(v1) * x1;
			y_shifted[col1] += m1;
			row_sum += conjf(v2) * x2;
			y_shifted[col2] += m2;
			row_sum += conjf(v3) * x3;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled pass. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = (v0) * x_row;

			row_sum += conjf(v0) * x0;
			y_shifted[col0] += m0;
			++nz;
		}

		/* Trailing entry (if distinct from the leading one): diagonal-tested again. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += conjf(VA[nz]) * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += VA[nz] * x_row;
		}
		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tT_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulating kernel for \f$y \leftarrow y + {A^T} \cdot x\f$, with \f$A = A^H\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; row i owns entries bpntr[i]..bpntr[i+1]-1 of VA and bindx.
	 * Each stored entry (i,j) is used twice: conjugated for out[i] (times rhs[j]) and,
	 * as is, for the mirrored update of out[j+coff-roff] (times rhs[i+roff-coff]).
	 * Only the first and the last entry of a row are tested against the diagonal: their
	 * mirrored update is skipped when roff==coff and j==i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex row_sum = ((float complex)(0));
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;
		rsb_coo_idx_t col;

		if (nz == row_end)
			continue;	/* nothing stored in this row */

		/* Leading entry: the mirrored update is suppressed on the diagonal. */
		col = bindx[nz];
		row_sum += conjf(VA[nz]) * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += VA[nz] * x_row;
		++nz;

		/* Interior entries, four per pass; mirrored unconditionally. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = (v0) * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = (v1) * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = (v2) * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = (v3) * x_row;

			row_sum += conjf(v0) * x0;
			y_shifted[col0] += m0;
			row_sum += conjf(v1) * x1;
			y_shifted[col1] += m1;
			row_sum += conjf(v2) * x2;
			y_shifted[col2] += m2;
			row_sum += conjf(v3) * x3;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled pass. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = (v0) * x_row;

			row_sum += conjf(v0) * x0;
			y_shifted[col0] += m0;
			++nz;
		}

		/* Trailing entry (if distinct from the leading one): diagonal-tested again. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += conjf(VA[nz]) * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += VA[nz] * x_row;
		}
		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_C__tC_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulating kernel for \f$y \leftarrow y + {A^H} \cdot x\f$, with \f$A = A^H\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; row i owns entries bpntr[i]..bpntr[i+1]-1 of VA and bindx.
	 * Each stored entry (i,j) is used twice: as is for out[i] (times rhs[j]) and,
	 * conjugated, for the mirrored update of out[j+coff-roff] (times rhs[i+roff-coff]).
	 * Only the first and the last entry of a row are tested against the diagonal: their
	 * mirrored update is skipped when roff==coff and j==i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex row_sum = ((float complex)(0));
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;
		rsb_coo_idx_t col;

		if (nz == row_end)
			continue;	/* nothing stored in this row */

		/* Leading entry: the mirrored update is suppressed on the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += conjf(VA[nz]) * x_row;
		++nz;

		/* Interior entries, four per pass; mirrored unconditionally. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = conjf(v1) * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = conjf(v2) * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = conjf(v3) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			row_sum += (v1) * x1;
			y_shifted[col1] += m1;
			row_sum += (v2) * x2;
			y_shifted[col2] += m2;
			row_sum += (v3) * x3;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled pass. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			++nz;
		}

		/* Trailing entry (if distinct from the leading one): diagonal-tested again. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += conjf(VA[nz]) * x_row;
		}
		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uaua_float_complex_H__tC_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Accumulating kernel for \f$y \leftarrow y + {A^H} \cdot x\f$, with \f$A = A^H\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br..bc-1 are visited; row i owns entries bpntr[i]..bpntr[i+1]-1 of VA and bindx.
	 * Each stored entry (i,j) is used twice: as is for out[i] (times rhs[j]) and,
	 * conjugated, for the mirrored update of out[j+coff-roff] (times rhs[i+roff-coff]).
	 * Only the first and the last entry of a row are tested against the diagonal: their
	 * mirrored update is skipped when roff==coff and j==i.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: that is left to the caller level. */
	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex row_sum = ((float complex)(0));
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;
		rsb_coo_idx_t col;

		if (nz == row_end)
			continue;	/* nothing stored in this row */

		/* Leading entry: the mirrored update is suppressed on the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += conjf(VA[nz]) * x_row;
		++nz;

		/* Interior entries, four per pass; mirrored unconditionally. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = conjf(v1) * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = conjf(v2) * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = conjf(v3) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			row_sum += (v1) * x1;
			y_shifted[col1] += m1;
			row_sum += (v2) * x2;
			y_shifted[col2] += m2;
			row_sum += (v3) * x3;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled pass. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			++nz;
		}

		/* Trailing entry (if distinct from the leading one): diagonal-tested again. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += conjf(VA[nz]) * x_row;
		}
		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tN_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting kernel for \f$y \leftarrow {A} \cdot x\f$, with \f$A \neq A^T\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * For each row i in 0..Mdim-1, out[i] is first cleared and then receives the sum of
	 * VA[k]*rhs[bindx[k]] over k in bpntr[i]..bpntr[i+1]-1, gathered four entries per pass.
	 * mdim, indptr, rpntr, cpntr, br, bc, roff, coff and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	rsb_coo_idx_t row;

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex row_sum = ((float complex)(0));
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;

		out[row] = 0;

		/* Unrolled pass: four stored entries at a time. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];

			row_sum += v0 * x0;
			row_sum += v1 * x1;
			row_sum += v2 * x2;
			row_sum += v3 * x3;
			nz += 4;
		}

		/* Remainder: up to three entries, one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];

			row_sum += v0 * x0;
			++nz;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tN_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting kernel for \f$y \leftarrow {A} \cdot x\f$, with \f$A \neq A^T\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * For each row i in 0..Mdim-1, out[i] is first cleared and then receives the sum of
	 * VA[k]*rhs[bindx[k]] over k in bpntr[i]..bpntr[i+1]-1, gathered four entries per pass.
	 * mdim, indptr, rpntr, cpntr, br, bc, roff, coff and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	rsb_coo_idx_t row;

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex row_sum = ((float complex)(0));
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;

		out[row] = 0;

		/* Unrolled pass: four stored entries at a time. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];

			row_sum += v0 * x0;
			row_sum += v1 * x1;
			row_sum += v2 * x2;
			row_sum += v3 * x3;
			nz += 4;
		}

		/* Remainder: up to three entries, one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];

			row_sum += v0 * x0;
			++nz;
		}

		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tT_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting kernel for \f$y \leftarrow {A^T} \cdot x\f$, with \f$A \neq A^T\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * The first mdim entries of out are handed to rsb__cblas_Xscal() with a NULL factor
	 * (presumably this clears them: confirm against that routine). Then, for each row i in
	 * 0..Mdim-1, every stored entry k in bpntr[i]..bpntr[i+1]-1 scatters
	 * VA[k]*rhs[i+roff-coff] into out[bindx[k]+coff-roff], four entries per pass.
	 * indptr, rpntr, cpntr, br, bc and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;

		/* Unrolled pass: four stored entries at a time. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = v1 * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = v2 * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = v3 * x_row;

			y_shifted[col0] += m0;
			y_shifted[col1] += m1;
			y_shifted[col2] += m2;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Remainder: up to three entries, one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * x_row;

			y_shifted[col0] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tT_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting kernel for \f$y \leftarrow {A^T} \cdot x\f$, with \f$A \neq A^T\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * The first mdim entries of out are handed to rsb__cblas_Xscal() with a NULL factor
	 * (presumably this clears them: confirm against that routine). Then, for each row i in
	 * 0..Mdim-1, every stored entry k in bpntr[i]..bpntr[i+1]-1 scatters
	 * VA[k]*rhs[i+roff-coff] into out[bindx[k]+coff-roff], four entries per pass.
	 * indptr, rpntr, cpntr, br, bc and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;

		/* Unrolled pass: four stored entries at a time. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = v1 * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = v2 * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = v3 * x_row;

			y_shifted[col0] += m0;
			y_shifted[col1] += m1;
			y_shifted[col2] += m2;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Remainder: up to three entries, one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * x_row;

			y_shifted[col0] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tC_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting kernel for \f$y \leftarrow {A^H} \cdot x\f$, with \f$A \neq A^T\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * The first mdim entries of out are handed to rsb__cblas_Xscal() with a NULL factor
	 * (presumably this clears them: confirm against that routine). Then, for each row i in
	 * 0..Mdim-1, every stored entry k in bpntr[i]..bpntr[i+1]-1 scatters
	 * conjf(VA[k])*rhs[i+roff-coff] into out[bindx[k]+coff-roff], four entries per pass.
	 * indptr, rpntr, cpntr, br, bc and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;

		/* Unrolled pass: four stored entries at a time. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex v0 = conjf(VA[nz]);
			const float complex m0 = v0 * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex v1 = conjf(VA[nz + 1]);
			const float complex m1 = v1 * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex v2 = conjf(VA[nz + 2]);
			const float complex m2 = v2 * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex v3 = conjf(VA[nz + 3]);
			const float complex m3 = v3 * x_row;

			y_shifted[col0] += m0;
			y_shifted[col1] += m1;
			y_shifted[col2] += m2;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Remainder: up to three entries, one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex v0 = conjf(VA[nz]);
			const float complex m0 = v0 * x_row;

			y_shifted[col0] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tC_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting kernel for \f$y \leftarrow {A^H} \cdot x\f$, with \f$A \neq A^T\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * The first mdim entries of out are handed to rsb__cblas_Xscal() with a NULL factor
	 * (presumably this clears them: confirm against that routine). Then, for each row i in
	 * 0..Mdim-1, every stored entry k in bpntr[i]..bpntr[i+1]-1 scatters
	 * conjf(VA[k])*rhs[i+roff-coff] into out[bindx[k]+coff-roff], four entries per pass.
	 * indptr, rpntr, cpntr, br, bc and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;

		/* Unrolled pass: four stored entries at a time. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex v0 = conjf(VA[nz]);
			const float complex m0 = v0 * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex v1 = conjf(VA[nz + 1]);
			const float complex m1 = v1 * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex v2 = conjf(VA[nz + 2]);
			const float complex m2 = v2 * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex v3 = conjf(VA[nz + 3]);
			const float complex m3 = v3 * x_row;

			y_shifted[col0] += m0;
			y_shifted[col1] += m1;
			y_shifted[col2] += m2;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Remainder: up to three entries, one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex v0 = conjf(VA[nz]);
			const float complex m0 = v0 * x_row;

			y_shifted[col0] += m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tN_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting kernel for \f$y \leftarrow {A} \cdot x\f$, with \f$A = A^T\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * The first Mdim entries of out are handed to rsb__cblas_Xscal() with a NULL factor
	 * (presumably this clears them: confirm against that routine). Rows 0..Mdim-1 are then
	 * visited; row i owns entries bpntr[i]..bpntr[i+1]-1 of VA and bindx.
	 * Each stored entry (i,j) is used twice, unconjugated: once for out[i] (times rhs[j])
	 * and once, mirrored, for out[j+coff-roff] (times rhs[i+roff-coff]).
	 * Only the first and the last entry of a row are tested against the diagonal: their
	 * mirrored update is skipped when roff==coff and j==i.
	 * mdim, indptr, rpntr, cpntr, br, bc and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, Mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex row_sum = ((float complex)(0));
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;
		rsb_coo_idx_t col;

		if (nz == row_end)
			continue;	/* nothing stored in this row */

		/* Leading entry: the mirrored update is suppressed on the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += VA[nz] * x_row;
		++nz;

		/* Interior entries, four per pass; mirrored unconditionally. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = (v0) * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = (v1) * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = (v2) * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = (v3) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			row_sum += (v1) * x1;
			y_shifted[col1] += m1;
			row_sum += (v2) * x2;
			y_shifted[col2] += m2;
			row_sum += (v3) * x3;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled pass. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = (v0) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			++nz;
		}

		/* Trailing entry (if distinct from the leading one): diagonal-tested again. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += VA[nz] * x_row;
		}
		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tN_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting kernel for \f$y \leftarrow {A} \cdot x\f$, with \f$A = A^T\f$.
	 * A is expected blocked 1 x 1, in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * The first Mdim entries of out are handed to rsb__cblas_Xscal() with a NULL factor
	 * (presumably this clears them: confirm against that routine). Rows 0..Mdim-1 are then
	 * visited; row i owns entries bpntr[i]..bpntr[i+1]-1 of VA and bindx.
	 * Each stored entry (i,j) is used twice, unconjugated: once for out[i] (times rhs[j])
	 * and once, mirrored, for out[j+coff-roff] (times rhs[i+roff-coff]).
	 * Only the first and the last entry of a row are tested against the diagonal: their
	 * mirrored update is skipped when roff==coff and j==i.
	 * mdim, indptr, rpntr, cpntr, br, bc and flags are not used in this kernel.
	 * \return \rsb_errval_inp_param_msg (as written: always RSB_ERR_NO_ERROR)
	 */

	const float complex *x_shifted = rhs + (roff - coff);
	float complex *y_shifted = out + (coff - roff);
	rsb_coo_idx_t row;

	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, Mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex row_sum = ((float complex)(0));
		const float complex x_row = x_shifted[row];
		const rsb_nnz_idx_t row_first = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t nz = row_first;
		rsb_coo_idx_t col;

		if (nz == row_end)
			continue;	/* nothing stored in this row */

		/* Leading entry: the mirrored update is suppressed on the diagonal. */
		col = bindx[nz];
		row_sum += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			y_shifted[col] += VA[nz] * x_row;
		++nz;

		/* Interior entries, four per pass; mirrored unconditionally. */
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = (v0) * x_row;
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const float complex x1 = rhs[col1];
			const float complex v1 = VA[nz + 1];
			const float complex m1 = (v1) * x_row;
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const float complex x2 = rhs[col2];
			const float complex v2 = VA[nz + 2];
			const float complex m2 = (v2) * x_row;
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x3 = rhs[col3];
			const float complex v3 = VA[nz + 3];
			const float complex m3 = (v3) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			row_sum += (v1) * x1;
			y_shifted[col1] += m1;
			row_sum += (v2) * x2;
			y_shifted[col2] += m2;
			row_sum += (v3) * x3;
			y_shifted[col3] += m3;
			nz += 4;
		}

		/* Interior entries left over by the unrolled pass. */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex m0 = (v0) * x_row;

			row_sum += (v0) * x0;
			y_shifted[col0] += m0;
			++nz;
		}

		/* Trailing entry (if distinct from the leading one): diagonal-tested again. */
		if (nz < row_end)
		{
			col = bindx[nz];
			row_sum += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				y_shifted[col] += VA[nz] * x_row;
		}
		out[row] += row_sum;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tT_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A^T} \cdot x\f$ for a symmetric matrix (\f$A = A^T\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * Each stored entry is added to the accumulator of its own row and, mirrored,
	 * to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out` (mdim elements); confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex v;

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		v = VA[nz];
		acc += v * rhs[col];
		if (!same_off || col != row)
			ysh[col] += v * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = v0 * xi, m1 = v1 * xi, m2 = v2 * xi, m3 = v3 * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			acc += v1 * x1;
			ysh[c1] += m1;
			acc += v2 * x2;
			ysh[c2] += m2;
			acc += v3 * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			v = VA[nz];
			acc += v * rhs[col];
			if (!same_off || col != row)
				ysh[col] += v * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tT_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A^T} \cdot x\f$ for a symmetric matrix (\f$A = A^T\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * Each stored entry is added to the accumulator of its own row and, mirrored,
	 * to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out` (mdim elements); confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex v;

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		v = VA[nz];
		acc += v * rhs[col];
		if (!same_off || col != row)
			ysh[col] += v * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = v0 * xi, m1 = v1 * xi, m2 = v2 * xi, m3 = v3 * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			acc += v1 * x1;
			ysh[c1] += m1;
			acc += v2 * x2;
			ysh[c2] += m2;
			acc += v3 * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			v = VA[nz];
			acc += v * rhs[col];
			if (!same_off || col != row)
				ysh[col] += v * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tC_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A^H} \cdot x\f$ for a symmetric matrix (\f$A = A^T\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * Each stored entry is conjugated, then added to the accumulator of its own row
	 * and, mirrored, to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out` (mdim elements); confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex w;	/* conjugated matrix value */

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		w = conjf(VA[nz]);
		acc += w * rhs[col];
		if (!same_off || col != row)
			ysh[col] += w * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = conjf(v0) * xi, m1 = conjf(v1) * xi, m2 = conjf(v2) * xi, m3 = conjf(v3) * xi;

			acc += conjf(v0) * x0;
			ysh[c0] += m0;
			acc += conjf(v1) * x1;
			ysh[c1] += m1;
			acc += conjf(v2) * x2;
			ysh[c2] += m2;
			acc += conjf(v3) * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * xi;

			acc += conjf(v0) * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			w = conjf(VA[nz]);
			acc += w * rhs[col];
			if (!same_off || col != row)
				ysh[col] += w * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tC_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A^H} \cdot x\f$ for a symmetric matrix (\f$A = A^T\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * Each stored entry is conjugated, then added to the accumulator of its own row
	 * and, mirrored, to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out` (mdim elements); confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex w;	/* conjugated matrix value */

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		w = conjf(VA[nz]);
		acc += w * rhs[col];
		if (!same_off || col != row)
			ysh[col] += w * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = conjf(v0) * xi, m1 = conjf(v1) * xi, m2 = conjf(v2) * xi, m3 = conjf(v3) * xi;

			acc += conjf(v0) * x0;
			ysh[c0] += m0;
			acc += conjf(v1) * x1;
			ysh[c1] += m1;
			acc += conjf(v2) * x2;
			ysh[c2] += m2;
			acc += conjf(v3) * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * xi;

			acc += conjf(v0) * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			w = conjf(VA[nz]);
			acc += w * rhs[col];
			if (!same_off || col != row)
				ysh[col] += w * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tN_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A} \cdot x\f$ for a Hermitian matrix (\f$A = A^H\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * Each stored entry is added as-is to the accumulator of its own row and,
	 * conjugated, to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out`; confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, Mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex v;

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		v = VA[nz];
		acc += v * rhs[col];
		if (!same_off || col != row)
			ysh[col] += conjf(v) * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = conjf(v0) * xi, m1 = conjf(v1) * xi, m2 = conjf(v2) * xi, m3 = conjf(v3) * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			acc += v1 * x1;
			ysh[c1] += m1;
			acc += v2 * x2;
			ysh[c2] += m2;
			acc += v3 * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			v = VA[nz];
			acc += v * rhs[col];
			if (!same_off || col != row)
				ysh[col] += conjf(v) * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tN_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A} \cdot x\f$ for a Hermitian matrix (\f$A = A^H\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * Each stored entry is added as-is to the accumulator of its own row and,
	 * conjugated, to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out`; confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, Mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex v;

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		v = VA[nz];
		acc += v * rhs[col];
		if (!same_off || col != row)
			ysh[col] += conjf(v) * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = conjf(v0) * xi, m1 = conjf(v1) * xi, m2 = conjf(v2) * xi, m3 = conjf(v3) * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			acc += v1 * x1;
			ysh[c1] += m1;
			acc += v2 * x2;
			ysh[c2] += m2;
			acc += v3 * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			v = VA[nz];
			acc += v * rhs[col];
			if (!same_off || col != row)
				ysh[col] += conjf(v) * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tT_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A^T} \cdot x\f$ for a Hermitian matrix (\f$A = A^H\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * Each stored entry is added, conjugated, to the accumulator of its own row
	 * and, as-is, to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out` (mdim elements); confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex v;

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		v = VA[nz];
		acc += conjf(v) * rhs[col];
		if (!same_off || col != row)
			ysh[col] += v * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = v0 * xi, m1 = v1 * xi, m2 = v2 * xi, m3 = v3 * xi;

			acc += conjf(v0) * x0;
			ysh[c0] += m0;
			acc += conjf(v1) * x1;
			ysh[c1] += m1;
			acc += conjf(v2) * x2;
			ysh[c2] += m2;
			acc += conjf(v3) * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * xi;

			acc += conjf(v0) * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			v = VA[nz];
			acc += conjf(v) * rhs[col];
			if (!same_off || col != row)
				ysh[col] += v * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tT_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A^T} \cdot x\f$ for a Hermitian matrix (\f$A = A^H\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * Each stored entry is added, conjugated, to the accumulator of its own row
	 * and, as-is, to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out` (mdim elements); confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex v;

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		v = VA[nz];
		acc += conjf(v) * rhs[col];
		if (!same_off || col != row)
			ysh[col] += v * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = v0 * xi, m1 = v1 * xi, m2 = v2 * xi, m3 = v3 * xi;

			acc += conjf(v0) * x0;
			ysh[c0] += m0;
			acc += conjf(v1) * x1;
			ysh[c1] += m1;
			acc += conjf(v2) * x2;
			ysh[c2] += m2;
			acc += conjf(v3) * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = v0 * xi;

			acc += conjf(v0) * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			v = VA[nz];
			acc += conjf(v) * rhs[col];
			if (!same_off || col != row)
				ysh[col] += v * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tC_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A^H} \cdot x\f$ for a Hermitian matrix (\f$A = A^H\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * Each stored entry is added as-is to the accumulator of its own row and,
	 * conjugated, to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out` (mdim elements); confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex v;

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		v = VA[nz];
		acc += v * rhs[col];
		if (!same_off || col != row)
			ysh[col] += conjf(v) * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = conjf(v0) * xi, m1 = conjf(v1) * xi, m2 = conjf(v2) * xi, m3 = conjf(v3) * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			acc += v1 * x1;
			ysh[c1] += m1;
			acc += v2 * x2;
			ysh[c2] += m2;
			acc += v3 * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			v = VA[nz];
			acc += v * rhs[col];
			if (!same_off || col != row)
				ysh[col] += conjf(v) * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tC_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A^H} \cdot x\f$ for a Hermitian matrix (\f$A = A^H\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * Each stored entry is added as-is to the accumulator of its own row and,
	 * conjugated, to the output element addressed by its column index.
	 * Only VA, rhs, out, Mdim, mdim, bindx, bpntr, roff and coff are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */
	const float complex *const xsh = rhs + (roff - coff);	/* rhs displaced by (roff - coff): operand of the mirrored updates */
	float complex *const ysh = out + (coff - roff);	/* out displaced by (coff - roff): target of the mirrored updates */
	const int same_off = (roff == coff);
	rsb_coo_idx_t row;

	/* NOTE(review): the NULL scaling factor presumably requests zeroing of `out` (mdim elements); confirm in rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX, mdim, NULL, out, 1);

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		float complex acc = 0;
		const float complex xi = xsh[row];
		const rsb_nnz_idx_t last = bpntr[row + 1];
		rsb_nnz_idx_t nz = bpntr[row];
		rsb_coo_idx_t col;
		float complex v;

		if (nz == last)
			continue;	/* empty row */

		/* First entry of the row: its mirror is skipped when it sits on the diagonal of an equal-offsets block. */
		col = bindx[nz];
		v = VA[nz];
		acc += v * rhs[col];
		if (!same_off || col != row)
			ysh[col] += conjf(v) * xi;
		++nz;

		/* Interior entries (all but the last one), four per iteration; mirrored unconditionally. */
		while (nz + 3 < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];
			const float complex m0 = conjf(v0) * xi, m1 = conjf(v1) * xi, m2 = conjf(v2) * xi, m3 = conjf(v3) * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			acc += v1 * x1;
			ysh[c1] += m1;
			acc += v2 * x2;
			ysh[c2] += m2;
			acc += v3 * x3;
			ysh[c3] += m3;
			nz += 4;
		}

		/* Interior leftovers, one at a time. */
		while (nz < last - 1)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];
			const float complex m0 = conjf(v0) * xi;

			acc += v0 * x0;
			ysh[c0] += m0;
			++nz;
		}

		/* Last entry of the row: same diagonal test as for the first one. */
		if (nz < last)
		{
			col = bindx[nz];
			v = VA[nz];
			acc += v * rhs[col];
			if (!same_off || col != row)
				ysh[col] += conjf(v) * xi;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tN_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A} \cdot x\f$ for a general matrix (\f$A \neq A^T\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 * Each output element is reset and then receives the dot product of the
	 * corresponding stored row with the entries of rhs selected by bindx.
	 * Only VA, rhs, out, Mdim, bindx and bpntr are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t row;

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		const rsb_nnz_idx_t first = bpntr[row], last = bpntr[row + 1];
		rsb_nnz_idx_t nz = first;
		float complex acc = 0;

		out[row] = 0;

		/* Four stored entries per iteration; products are accumulated in storage order. */
		while (nz + 3 < last)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];

			acc += v0 * x0;
			acc += v1 * x1;
			acc += v2 * x2;
			acc += v3 * x3;
			nz += 4;
		}

		/* Up to three leftover entries. */
		while (nz < last)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];

			acc += v0 * x0;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tN_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Overwriting product \f$y \leftarrow {A} \cdot x\f$ for a general matrix (\f$A \neq A^T\f$).
	 * A is expected to be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 * Each output element is reset and then receives the dot product of the
	 * corresponding stored row with the entries of rhs selected by bindx.
	 * Only VA, rhs, out, Mdim, bindx and bpntr are referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	rsb_coo_idx_t row;

	for (row = 0; RSB_LIKELY(row < Mdim); ++row)
	{
		const rsb_nnz_idx_t first = bpntr[row], last = bpntr[row + 1];
		rsb_nnz_idx_t nz = first;
		float complex acc = 0;

		out[row] = 0;

		/* Four stored entries per iteration; products are accumulated in storage order. */
		while (nz + 3 < last)
		{
			const rsb_coo_idx_t c0 = bindx[nz], c1 = bindx[nz + 1], c2 = bindx[nz + 2], c3 = bindx[nz + 3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex v0 = VA[nz], v1 = VA[nz + 1], v2 = VA[nz + 2], v3 = VA[nz + 3];

			acc += v0 * x0;
			acc += v1 * x1;
			acc += v2 * x2;
			acc += v3 * x3;
			nz += 4;
		}

		/* Up to three leftover entries. */
		while (nz < last)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex v0 = VA[nz];

			acc += v0 * x0;
			++nz;
		}

		out[row] += acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tT_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): for each stored row i
	 * in [0,Mdim), VA[k]*trhs[i] is added to tout[bindx[k]] for every k in
	 * [bpntr[i],bpntr[i+1]).
	 * The parameters indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four nonzeroes of the row per pass */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=v0*x_row;
			const float complex upd1=v1*x_row;
			const float complex upd2=v2*x_row;
			const float complex upd3=v3*x_row;

			y_shifted[col0]+=upd0;
			y_shifted[col1]+=upd1;
			y_shifted[col2]+=upd2;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* remainder: at most three nonzeroes left in the row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex v0=VA[nz];
			const float complex upd0=v0*x_row;

			y_shifted[col0]+=upd0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tT_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): for each stored row i
	 * in [0,Mdim), VA[k]*trhs[i] is added to tout[bindx[k]] for every k in
	 * [bpntr[i],bpntr[i+1]).
	 * The parameters indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four nonzeroes of the row per pass */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=v0*x_row;
			const float complex upd1=v1*x_row;
			const float complex upd2=v2*x_row;
			const float complex upd3=v3*x_row;

			y_shifted[col0]+=upd0;
			y_shifted[col1]+=upd1;
			y_shifted[col2]+=upd2;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* remainder: at most three nonzeroes left in the row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex v0=VA[nz];
			const float complex upd0=v0*x_row;

			y_shifted[col0]+=upd0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tC_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): for each stored row i
	 * in [0,Mdim), conjf(VA[k])*trhs[i] is added to tout[bindx[k]] for every k in
	 * [bpntr[i],bpntr[i+1]).
	 * The parameters indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four conjugated nonzeroes of the row per pass */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex cv0=conjf(VA[nz]);
			const float complex cv1=conjf(VA[nz+1]);
			const float complex cv2=conjf(VA[nz+2]);
			const float complex cv3=conjf(VA[nz+3]);
			const float complex upd0=cv0*x_row;
			const float complex upd1=cv1*x_row;
			const float complex upd2=cv2*x_row;
			const float complex upd3=cv3*x_row;

			y_shifted[col0]+=upd0;
			y_shifted[col1]+=upd1;
			y_shifted[col2]+=upd2;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* remainder: at most three nonzeroes left in the row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex cv0=conjf(VA[nz]);
			const float complex upd0=cv0*x_row;

			y_shifted[col0]+=upd0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tC_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A \neq A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): for each stored row i
	 * in [0,Mdim), conjf(VA[k])*trhs[i] is added to tout[bindx[k]] for every k in
	 * [bpntr[i],bpntr[i+1]).
	 * The parameters indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		rsb_nnz_idx_t nz=bpntr[row];

		/* unrolled part: four conjugated nonzeroes of the row per pass */
		while(nz+3<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex cv0=conjf(VA[nz]);
			const float complex cv1=conjf(VA[nz+1]);
			const float complex cv2=conjf(VA[nz+2]);
			const float complex cv3=conjf(VA[nz+3]);
			const float complex upd0=cv0*x_row;
			const float complex upd1=cv1*x_row;
			const float complex upd2=cv2*x_row;
			const float complex upd3=cv3*x_row;

			y_shifted[col0]+=upd0;
			y_shifted[col1]+=upd1;
			y_shifted[col2]+=upd2;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* remainder: at most three nonzeroes left in the row */
		while(nz<row_end)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex cv0=conjf(VA[nz]);
			const float complex upd0=cv0*x_row;

			y_shifted[col0]+=upd0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tN_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): every stored nonzero k
	 * of row i, with column j=bindx[k], adds VA[k]*rhs[j] to out[i] and
	 * VA[k]*trhs[i] to tout[j]. Only the first and the last stored entry of a
	 * row are tested for lying on the diagonal (roff==coff and j==i); when they
	 * do, their tout contribution is skipped.
	 * The parameters mdim, indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first Mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,Mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_begin=bpntr[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		float complex acc=((float complex)(0));
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		/* a row with no stored entries contributes nothing */
		if(nz==row_end)
			continue;

		/* first stored entry: may sit on the diagonal */
		col=bindx[nz];
		acc+=VA[nz]*rhs[col];
		if(roff!=coff || col!=row)
			y_shifted[col]+=VA[nz]*x_row;

		/* interior entries, four per pass; the last entry is left out */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex x0=rhs[col0];
			const float complex x1=rhs[col1];
			const float complex x2=rhs[col2];
			const float complex x3=rhs[col3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=v0*x_row;
			const float complex upd1=v1*x_row;
			const float complex upd2=v2*x_row;
			const float complex upd3=v3*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			acc+=v1*x1;
			y_shifted[col1]+=upd1;
			acc+=v2*x2;
			y_shifted[col2]+=upd2;
			acc+=v3*x3;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* interior remainder */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex x0=rhs[col0];
			const float complex v0=VA[nz];
			const float complex upd0=v0*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			++nz;
		}

		/* last stored entry (when distinct from the first): may sit on the diagonal */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc+=VA[nz]*rhs[col];
			if(roff!=coff || col!=row)
				y_shifted[col]+=VA[nz]*x_row;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tN_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): every stored nonzero k
	 * of row i, with column j=bindx[k], adds VA[k]*rhs[j] to out[i] and
	 * VA[k]*trhs[i] to tout[j]. Only the first and the last stored entry of a
	 * row are tested for lying on the diagonal (roff==coff and j==i); when they
	 * do, their tout contribution is skipped.
	 * The parameters mdim, indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first Mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,Mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_begin=bpntr[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		float complex acc=((float complex)(0));
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		/* a row with no stored entries contributes nothing */
		if(nz==row_end)
			continue;

		/* first stored entry: may sit on the diagonal */
		col=bindx[nz];
		acc+=VA[nz]*rhs[col];
		if(roff!=coff || col!=row)
			y_shifted[col]+=VA[nz]*x_row;

		/* interior entries, four per pass; the last entry is left out */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex x0=rhs[col0];
			const float complex x1=rhs[col1];
			const float complex x2=rhs[col2];
			const float complex x3=rhs[col3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=v0*x_row;
			const float complex upd1=v1*x_row;
			const float complex upd2=v2*x_row;
			const float complex upd3=v3*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			acc+=v1*x1;
			y_shifted[col1]+=upd1;
			acc+=v2*x2;
			y_shifted[col2]+=upd2;
			acc+=v3*x3;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* interior remainder */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex x0=rhs[col0];
			const float complex v0=VA[nz];
			const float complex upd0=v0*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			++nz;
		}

		/* last stored entry (when distinct from the first): may sit on the diagonal */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc+=VA[nz]*rhs[col];
			if(roff!=coff || col!=row)
				y_shifted[col]+=VA[nz]*x_row;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tT_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): every stored nonzero k
	 * of row i, with column j=bindx[k], adds VA[k]*rhs[j] to out[i] and
	 * VA[k]*trhs[i] to tout[j]. Only the first and the last stored entry of a
	 * row are tested for lying on the diagonal (roff==coff and j==i); when they
	 * do, their tout contribution is skipped.
	 * The parameters indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_begin=bpntr[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		float complex acc=((float complex)(0));
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		/* a row with no stored entries contributes nothing */
		if(nz==row_end)
			continue;

		/* first stored entry: may sit on the diagonal */
		col=bindx[nz];
		acc+=VA[nz]*rhs[col];
		if(roff!=coff || col!=row)
			y_shifted[col]+=VA[nz]*x_row;

		/* interior entries, four per pass; the last entry is left out */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex x0=rhs[col0];
			const float complex x1=rhs[col1];
			const float complex x2=rhs[col2];
			const float complex x3=rhs[col3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=v0*x_row;
			const float complex upd1=v1*x_row;
			const float complex upd2=v2*x_row;
			const float complex upd3=v3*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			acc+=v1*x1;
			y_shifted[col1]+=upd1;
			acc+=v2*x2;
			y_shifted[col2]+=upd2;
			acc+=v3*x3;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* interior remainder */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex x0=rhs[col0];
			const float complex v0=VA[nz];
			const float complex upd0=v0*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			++nz;
		}

		/* last stored entry (when distinct from the first): may sit on the diagonal */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc+=VA[nz]*rhs[col];
			if(roff!=coff || col!=row)
				y_shifted[col]+=VA[nz]*x_row;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tT_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): every stored nonzero k
	 * of row i, with column j=bindx[k], adds VA[k]*rhs[j] to out[i] and
	 * VA[k]*trhs[i] to tout[j]. Only the first and the last stored entry of a
	 * row are tested for lying on the diagonal (roff==coff and j==i); when they
	 * do, their tout contribution is skipped.
	 * The parameters indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_begin=bpntr[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		float complex acc=((float complex)(0));
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		/* a row with no stored entries contributes nothing */
		if(nz==row_end)
			continue;

		/* first stored entry: may sit on the diagonal */
		col=bindx[nz];
		acc+=VA[nz]*rhs[col];
		if(roff!=coff || col!=row)
			y_shifted[col]+=VA[nz]*x_row;

		/* interior entries, four per pass; the last entry is left out */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex x0=rhs[col0];
			const float complex x1=rhs[col1];
			const float complex x2=rhs[col2];
			const float complex x3=rhs[col3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=v0*x_row;
			const float complex upd1=v1*x_row;
			const float complex upd2=v2*x_row;
			const float complex upd3=v3*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			acc+=v1*x1;
			y_shifted[col1]+=upd1;
			acc+=v2*x2;
			y_shifted[col2]+=upd2;
			acc+=v3*x3;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* interior remainder */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex x0=rhs[col0];
			const float complex v0=VA[nz];
			const float complex upd0=v0*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			++nz;
		}

		/* last stored entry (when distinct from the first): may sit on the diagonal */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc+=VA[nz]*rhs[col];
			if(roff!=coff || col!=row)
				y_shifted[col]+=VA[nz]*x_row;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tC_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): every stored nonzero k
	 * of row i, with column j=bindx[k], adds conjf(VA[k])*rhs[j] to out[i] and
	 * conjf(VA[k])*trhs[i] to tout[j]. Only the first and the last stored entry
	 * of a row are tested for lying on the diagonal (roff==coff and j==i); when
	 * they do, their tout contribution is skipped.
	 * The parameters indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_begin=bpntr[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		float complex acc=((float complex)(0));
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		/* a row with no stored entries contributes nothing */
		if(nz==row_end)
			continue;

		/* first stored entry: may sit on the diagonal */
		col=bindx[nz];
		acc+=conjf(VA[nz])*rhs[col];
		if(roff!=coff || col!=row)
			y_shifted[col]+=conjf(VA[nz])*x_row;

		/* interior entries, four per pass; the last entry is left out */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex x0=rhs[col0];
			const float complex x1=rhs[col1];
			const float complex x2=rhs[col2];
			const float complex x3=rhs[col3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=conjf(v0)*x_row;
			const float complex upd1=conjf(v1)*x_row;
			const float complex upd2=conjf(v2)*x_row;
			const float complex upd3=conjf(v3)*x_row;

			acc+=conjf(v0)*x0;
			y_shifted[col0]+=upd0;
			acc+=conjf(v1)*x1;
			y_shifted[col1]+=upd1;
			acc+=conjf(v2)*x2;
			y_shifted[col2]+=upd2;
			acc+=conjf(v3)*x3;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* interior remainder */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex x0=rhs[col0];
			const float complex v0=VA[nz];
			const float complex upd0=conjf(v0)*x_row;

			acc+=conjf(v0)*x0;
			y_shifted[col0]+=upd0;
			++nz;
		}

		/* last stored entry (when distinct from the first): may sit on the diagonal */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc+=conjf(VA[nz])*rhs[col];
			if(roff!=coff || col!=row)
				y_shifted[col]+=conjf(VA[nz])*x_row;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tC_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x\f$, where \f$A == A^T\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): every stored nonzero k
	 * of row i, with column j=bindx[k], adds conjf(VA[k])*rhs[j] to out[i] and
	 * conjf(VA[k])*trhs[i] to tout[j]. Only the first and the last stored entry
	 * of a row are tested for lying on the diagonal (roff==coff and j==i); when
	 * they do, their tout contribution is skipped.
	 * The parameters indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_begin=bpntr[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		float complex acc=((float complex)(0));
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		/* a row with no stored entries contributes nothing */
		if(nz==row_end)
			continue;

		/* first stored entry: may sit on the diagonal */
		col=bindx[nz];
		acc+=conjf(VA[nz])*rhs[col];
		if(roff!=coff || col!=row)
			y_shifted[col]+=conjf(VA[nz])*x_row;

		/* interior entries, four per pass; the last entry is left out */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex x0=rhs[col0];
			const float complex x1=rhs[col1];
			const float complex x2=rhs[col2];
			const float complex x3=rhs[col3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=conjf(v0)*x_row;
			const float complex upd1=conjf(v1)*x_row;
			const float complex upd2=conjf(v2)*x_row;
			const float complex upd3=conjf(v3)*x_row;

			acc+=conjf(v0)*x0;
			y_shifted[col0]+=upd0;
			acc+=conjf(v1)*x1;
			y_shifted[col1]+=upd1;
			acc+=conjf(v2)*x2;
			y_shifted[col2]+=upd2;
			acc+=conjf(v3)*x3;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* interior remainder */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex x0=rhs[col0];
			const float complex v0=VA[nz];
			const float complex upd0=conjf(v0)*x_row;

			acc+=conjf(v0)*x0;
			y_shifted[col0]+=upd0;
			++nz;
		}

		/* last stored entry (when distinct from the first): may sit on the diagonal */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc+=conjf(VA[nz])*rhs[col];
			if(roff!=coff || col!=row)
				y_shifted[col]+=conjf(VA[nz])*x_row;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tN_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): every stored nonzero k
	 * of row i, with column j=bindx[k], adds VA[k]*rhs[j] to out[i] and
	 * conjf(VA[k])*trhs[i] to tout[j]. Only the first and the last stored entry
	 * of a row are tested for lying on the diagonal (roff==coff and j==i); when
	 * they do, their tout contribution is skipped.
	 * The parameters mdim, indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first Mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,Mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_begin=bpntr[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		float complex acc=((float complex)(0));
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		/* a row with no stored entries contributes nothing */
		if(nz==row_end)
			continue;

		/* first stored entry: may sit on the diagonal */
		col=bindx[nz];
		acc+=VA[nz]*rhs[col];
		if(roff!=coff || col!=row)
			y_shifted[col]+=conjf(VA[nz])*x_row;

		/* interior entries, four per pass; the last entry is left out */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex x0=rhs[col0];
			const float complex x1=rhs[col1];
			const float complex x2=rhs[col2];
			const float complex x3=rhs[col3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=conjf(v0)*x_row;
			const float complex upd1=conjf(v1)*x_row;
			const float complex upd2=conjf(v2)*x_row;
			const float complex upd3=conjf(v3)*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			acc+=v1*x1;
			y_shifted[col1]+=upd1;
			acc+=v2*x2;
			y_shifted[col2]+=upd2;
			acc+=v3*x3;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* interior remainder */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex x0=rhs[col0];
			const float complex v0=VA[nz];
			const float complex upd0=conjf(v0)*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			++nz;
		}

		/* last stored entry (when distinct from the first): may sit on the diagonal */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc+=VA[nz]*rhs[col];
			if(roff!=coff || col!=row)
				y_shifted[col]+=conjf(VA[nz])*x_row;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tN_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{
	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A} \cdot x\f$, where \f$A == A^H\f$.
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * With trhs=rhs+(roff-coff) and tout=out+(coff-roff): every stored nonzero k
	 * of row i, with column j=bindx[k], adds VA[k]*rhs[j] to out[i] and
	 * conjf(VA[k])*trhs[i] to tout[j]. Only the first and the last stored entry
	 * of a row are tested for lying on the diagonal (roff==coff and j==i); when
	 * they do, their tout contribution is skipped.
	 * The parameters mdim, indptr, rpntr, cpntr, br, bc and flags are not referenced.
	 * \return \rsb_errval_inp_param_msg (this kernel always returns RSB_ERR_NO_ERROR)
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *const x_shifted=rhs+(roff-coff);
	float complex *const y_shifted=out+(coff-roff);
	rsb_coo_idx_t row;

	/* NOTE(review): presumably clears the first Mdim entries of out (NULL scaling factor) -- confirm against rsb__cblas_Xscal. */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX,Mdim,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex x_row=x_shifted[row];
		const rsb_nnz_idx_t row_begin=bpntr[row];
		const rsb_nnz_idx_t row_end=bpntr[row+1];
		float complex acc=((float complex)(0));
		rsb_nnz_idx_t nz=row_begin;
		rsb_coo_idx_t col;

		/* a row with no stored entries contributes nothing */
		if(nz==row_end)
			continue;

		/* first stored entry: may sit on the diagonal */
		col=bindx[nz];
		acc+=VA[nz]*rhs[col];
		if(roff!=coff || col!=row)
			y_shifted[col]+=conjf(VA[nz])*x_row;

		/* interior entries, four per pass; the last entry is left out */
		nz=row_begin+1;
		while(nz+3<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const rsb_coo_idx_t col1=bindx[nz+1];
			const rsb_coo_idx_t col2=bindx[nz+2];
			const rsb_coo_idx_t col3=bindx[nz+3];
			const float complex x0=rhs[col0];
			const float complex x1=rhs[col1];
			const float complex x2=rhs[col2];
			const float complex x3=rhs[col3];
			const float complex v0=VA[nz];
			const float complex v1=VA[nz+1];
			const float complex v2=VA[nz+2];
			const float complex v3=VA[nz+3];
			const float complex upd0=conjf(v0)*x_row;
			const float complex upd1=conjf(v1)*x_row;
			const float complex upd2=conjf(v2)*x_row;
			const float complex upd3=conjf(v3)*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			acc+=v1*x1;
			y_shifted[col1]+=upd1;
			acc+=v2*x2;
			y_shifted[col2]+=upd2;
			acc+=v3*x3;
			y_shifted[col3]+=upd3;
			nz+=4;
		}

		/* interior remainder */
		while(nz<row_end-1)
		{
			const rsb_coo_idx_t col0=bindx[nz];
			const float complex x0=rhs[col0];
			const float complex v0=VA[nz];
			const float complex upd0=conjf(v0)*x_row;

			acc+=v0*x0;
			y_shifted[col0]+=upd0;
			++nz;
		}

		/* last stored entry (when distinct from the first): may sit on the diagonal */
		if(nz<row_end)
		{
			col=bindx[nz];
			acc+=VA[nz]*rhs[col];
			if(roff!=coff || col!=row)
				y_shifted[col]+=conjf(VA[nz])*x_row;
		}

		out[row]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tT_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 * For every stored element VA[k] of row i with column index j, conj(VA[k])*rhs[j]
	 * is accumulated for out[i] and VA[k]*x_i is added to the offset output entry j.
	 * indptr, rpntr, cpntr, br, bc and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff): target of the mirrored updates */
	rsb_coo_idx_t row=0;

	/* out is rescaled with a NULL factor first (presumably zeroing it; confirm against rsb__cblas_Xscal) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex xr=xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;
		rsb_nnz_idx_t last=0;
		rsb_coo_idx_t col=0;

		if(first==end)
			continue;	/* empty row: nothing to accumulate */
		last=end-1;

		/* leading element: the mirrored update is skipped only when roff==coff and col==row */
		col=bindx[nz];
		acc += conjf(VA[nz])*rhs[1*col*1];
		if(!(roff==coff && col==row))
			yoff[(1)*(col)*1]+=VA[nz]*xr;

		/* elements strictly between the first and the last one, four per pass */
		for(nz=first+1;nz+3<last;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=(v0)*xr,m1=(v1)*xr,m2=(v2)*xr,m3=(v3)*xr;

			acc += conjf(v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
			acc += conjf(v1)*x1;
			yoff[(1)*(c1)*1]+=m1;
			acc += conjf(v2)*x2;
			yoff[(1)*(c2)*1]+=m2;
			acc += conjf(v3)*x3;
			yoff[(1)*(c3)*1]+=m3;
		}

		/* remaining interior elements, one per pass */
		for(;nz<last;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];
			const float complex m0=(v0)*xr;

			acc += conjf(v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
		}

		/* trailing element (absent if the row has a single element): same guard as the leading one */
		if(nz<end)
		{
			col=bindx[nz];
			acc += conjf(VA[nz])*rhs[1*col*1];
			if(!(roff==coff && col==row))
				yoff[(1)*(col)*1]+=VA[nz]*xr;
		}
		out[(1*row*1)]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tT_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 * For every stored element VA[k] of row i with column index j, conj(VA[k])*rhs[j]
	 * is accumulated for out[i] and VA[k]*x_i is added to the offset output entry j.
	 * indptr, rpntr, cpntr, br, bc and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff): target of the mirrored updates */
	rsb_coo_idx_t row=0;

	/* out is rescaled with a NULL factor first (presumably zeroing it; confirm against rsb__cblas_Xscal) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex xr=xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;
		rsb_nnz_idx_t last=0;
		rsb_coo_idx_t col=0;

		if(first==end)
			continue;	/* empty row: nothing to accumulate */
		last=end-1;

		/* leading element: the mirrored update is skipped only when roff==coff and col==row */
		col=bindx[nz];
		acc += conjf(VA[nz])*rhs[1*col*1];
		if(!(roff==coff && col==row))
			yoff[(1)*(col)*1]+=VA[nz]*xr;

		/* elements strictly between the first and the last one, four per pass */
		for(nz=first+1;nz+3<last;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=(v0)*xr,m1=(v1)*xr,m2=(v2)*xr,m3=(v3)*xr;

			acc += conjf(v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
			acc += conjf(v1)*x1;
			yoff[(1)*(c1)*1]+=m1;
			acc += conjf(v2)*x2;
			yoff[(1)*(c2)*1]+=m2;
			acc += conjf(v3)*x3;
			yoff[(1)*(c3)*1]+=m3;
		}

		/* remaining interior elements, one per pass */
		for(;nz<last;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];
			const float complex m0=(v0)*xr;

			acc += conjf(v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
		}

		/* trailing element (absent if the row has a single element): same guard as the leading one */
		if(nz<end)
		{
			col=bindx[nz];
			acc += conjf(VA[nz])*rhs[1*col*1];
			if(!(roff==coff && col==row))
				yoff[(1)*(col)*1]+=VA[nz]*xr;
		}
		out[(1*row*1)]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_C__tC_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 * For every stored element VA[k] of row i with column index j, VA[k]*rhs[j]
	 * is accumulated for out[i] and conj(VA[k])*x_i is added to the offset output entry j.
	 * indptr, rpntr, cpntr, br, bc and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff): target of the mirrored updates */
	rsb_coo_idx_t row=0;

	/* out is rescaled with a NULL factor first (presumably zeroing it; confirm against rsb__cblas_Xscal) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex xr=xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;
		rsb_nnz_idx_t last=0;
		rsb_coo_idx_t col=0;

		if(first==end)
			continue;	/* empty row: nothing to accumulate */
		last=end-1;

		/* leading element: the mirrored update is skipped only when roff==coff and col==row */
		col=bindx[nz];
		acc += VA[nz]*rhs[1*col*1];
		if(!(roff==coff && col==row))
			yoff[(1)*(col)*1]+=conjf(VA[nz])*xr;

		/* elements strictly between the first and the last one, four per pass */
		for(nz=first+1;nz+3<last;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=conjf(v0)*xr,m1=conjf(v1)*xr,m2=conjf(v2)*xr,m3=conjf(v3)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
			acc += (v1)*x1;
			yoff[(1)*(c1)*1]+=m1;
			acc += (v2)*x2;
			yoff[(1)*(c2)*1]+=m2;
			acc += (v3)*x3;
			yoff[(1)*(c3)*1]+=m3;
		}

		/* remaining interior elements, one per pass */
		for(;nz<last;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];
			const float complex m0=conjf(v0)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
		}

		/* trailing element (absent if the row has a single element): same guard as the leading one */
		if(nz<end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(roff==coff && col==row))
				yoff[(1)*(col)*1]+=conjf(VA[nz])*xr;
		}
		out[(1*row*1)]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uauz_float_complex_H__tC_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 * For every stored element VA[k] of row i with column index j, VA[k]*rhs[j]
	 * is accumulated for out[i] and conj(VA[k])*x_i is added to the offset output entry j.
	 * indptr, rpntr, cpntr, br, bc and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff): target of the mirrored updates */
	rsb_coo_idx_t row=0;

	/* out is rescaled with a NULL factor first (presumably zeroing it; confirm against rsb__cblas_Xscal) */
	rsb__cblas_Xscal(RSB_NUMERICAL_TYPE_FLOAT_COMPLEX ,mdim*1,NULL,out,1);

	for(row=0;RSB_LIKELY(row<Mdim);++row)
	{
		const float complex xr=xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;
		rsb_nnz_idx_t last=0;
		rsb_coo_idx_t col=0;

		if(first==end)
			continue;	/* empty row: nothing to accumulate */
		last=end-1;

		/* leading element: the mirrored update is skipped only when roff==coff and col==row */
		col=bindx[nz];
		acc += VA[nz]*rhs[1*col*1];
		if(!(roff==coff && col==row))
			yoff[(1)*(col)*1]+=conjf(VA[nz])*xr;

		/* elements strictly between the first and the last one, four per pass */
		for(nz=first+1;nz+3<last;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=conjf(v0)*xr,m1=conjf(v1)*xr,m2=conjf(v2)*xr,m3=conjf(v3)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
			acc += (v1)*x1;
			yoff[(1)*(c1)*1]+=m1;
			acc += (v2)*x2;
			yoff[(1)*(c2)*1]+=m2;
			acc += (v3)*x3;
			yoff[(1)*(c3)*1]+=m3;
		}

		/* remaining interior elements, one per pass */
		for(;nz<last;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];
			const float complex m0=conjf(v0)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
		}

		/* trailing element (absent if the row has a single element): same guard as the leading one */
		if(nz<end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(roff==coff && col==row))
				yoff[(1)*(col)*1]+=conjf(VA[nz])*xr;
		}
		out[(1*row*1)]+=acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tN_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * Each row in [br,bc) is reduced to a dot product of its stored values with
	 * the rhs entries at their column indices; alpha times that sum is added to out[row].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;

		/* four stored elements per pass */
		while(nz+3<end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* leftover elements, one per pass */
		while(nz<end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];

			acc+=v0*x0;
			++nz;
		}

		out[(1*row*1)]+=(alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tN_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * Each row in [br,bc) is reduced to a dot product of its stored values with
	 * the rhs entries at their column indices; alpha times that sum is added to out[row].
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;

		/* four stored elements per pass */
		while(nz+3<end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];

			acc+=v0*x0;
			acc+=v1*x1;
			acc+=v2*x2;
			acc+=v3*x3;
			nz+=4;
		}

		/* leftover elements, one per pass */
		while(nz<end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];

			acc+=v0*x0;
			++nz;
		}

		out[(1*row*1)]+=(alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tT_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * For each row in [br,bc), every stored value times (alpha * offset rhs entry of
	 * that row) is added to the offset output entry at the value's column index.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff) */
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr=(alpha)*xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		rsb_nnz_idx_t nz=first;

		/* four stored elements per pass: products first, then the scattered additions in order */
		while(nz+3<end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=v0*xr,m1=v1*xr,m2=v2*xr,m3=v3*xr;

			yoff[(1)*(c0)*1]+=m0;
			yoff[(1)*(c1)*1]+=m1;
			yoff[(1)*(c2)*1]+=m2;
			yoff[(1)*(c3)*1]+=m3;
			nz+=4;
		}

		/* leftover elements, one per pass */
		while(nz<end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex v0=VA[nz];
			const float complex m0=v0*xr;

			yoff[(1)*(c0)*1]+=m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tT_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * For each row in [br,bc), every stored value times (alpha * offset rhs entry of
	 * that row) is added to the offset output entry at the value's column index.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff) */
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr=(alpha)*xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		rsb_nnz_idx_t nz=first;

		/* four stored elements per pass: products first, then the scattered additions in order */
		while(nz+3<end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=v0*xr,m1=v1*xr,m2=v2*xr,m3=v3*xr;

			yoff[(1)*(c0)*1]+=m0;
			yoff[(1)*(c1)*1]+=m1;
			yoff[(1)*(c2)*1]+=m2;
			yoff[(1)*(c3)*1]+=m3;
			nz+=4;
		}

		/* leftover elements, one per pass */
		while(nz<end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex v0=VA[nz];
			const float complex m0=v0*xr;

			yoff[(1)*(c0)*1]+=m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tC_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * For each row in [br,bc), the conjugate of every stored value times (alpha * offset
	 * rhs entry of that row) is added to the offset output entry at the value's column index.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff) */
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr=(alpha)*xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		rsb_nnz_idx_t nz=first;

		/* four stored elements per pass: conjugated values, products, then the scattered additions in order */
		while(nz+3<end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex v0=conjf(VA[nz+0]),v1=conjf(VA[nz+1]),v2=conjf(VA[nz+2]),v3=conjf(VA[nz+3]);
			const float complex m0=v0*xr,m1=v1*xr,m2=v2*xr,m3=v3*xr;

			yoff[(1)*(c0)*1]+=m0;
			yoff[(1)*(c1)*1]+=m1;
			yoff[(1)*(c2)*1]+=m2;
			yoff[(1)*(c3)*1]+=m3;
			nz+=4;
		}

		/* leftover elements, one per pass */
		while(nz<end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex v0=conjf(VA[nz]);
			const float complex m0=v0*xr;

			yoff[(1)*(c0)*1]+=m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tC_r1_c1_uu_sU_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * For each row in [br,bc), the conjugate of every stored value times (alpha * offset
	 * rhs entry of that row) is added to the offset output entry at the value's column index.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff) */
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr=(alpha)*xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		rsb_nnz_idx_t nz=first;

		/* four stored elements per pass: conjugated values, products, then the scattered additions in order */
		while(nz+3<end)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex v0=conjf(VA[nz+0]),v1=conjf(VA[nz+1]),v2=conjf(VA[nz+2]),v3=conjf(VA[nz+3]);
			const float complex m0=v0*xr,m1=v1*xr,m2=v2*xr,m3=v3*xr;

			yoff[(1)*(c0)*1]+=m0;
			yoff[(1)*(c1)*1]+=m1;
			yoff[(1)*(c2)*1]+=m2;
			yoff[(1)*(c3)*1]+=m3;
			nz+=4;
		}

		/* leftover elements, one per pass */
		while(nz<end)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex v0=conjf(VA[nz]);
			const float complex m0=v0*xr;

			yoff[(1)*(c0)*1]+=m0;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tN_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * For every stored element VA[k] of a row in [br,bc) with column index j, VA[k]*rhs[j]
	 * is accumulated (and scaled by alpha) for out[row], while VA[k] times (alpha * offset
	 * rhs entry of the row) is added to the offset output entry j.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff): target of the mirrored updates */
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr=(alpha)*xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;
		rsb_nnz_idx_t last=0;
		rsb_coo_idx_t col=0;

		if(first==end)
			continue;	/* empty row: out[row] is left untouched */
		last=end-1;

		/* leading element: the mirrored update is skipped only when roff==coff and col==row */
		col=bindx[nz];
		acc += VA[nz]*rhs[1*col*1];
		if(!(roff==coff && col==row))
			yoff[(1)*(col)*1]+=VA[nz]*xr;

		/* elements strictly between the first and the last one, four per pass */
		for(nz=first+1;nz+3<last;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=(v0)*xr,m1=(v1)*xr,m2=(v2)*xr,m3=(v3)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
			acc += (v1)*x1;
			yoff[(1)*(c1)*1]+=m1;
			acc += (v2)*x2;
			yoff[(1)*(c2)*1]+=m2;
			acc += (v3)*x3;
			yoff[(1)*(c3)*1]+=m3;
		}

		/* remaining interior elements, one per pass */
		for(;nz<last;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];
			const float complex m0=(v0)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
		}

		/* trailing element (absent if the row has a single element): same guard as the leading one */
		if(nz<end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(roff==coff && col==row))
				yoff[(1)*(col)*1]+=VA[nz]*xr;
		}
		out[(1*row*1)]+=(alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tN_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * For every stored element VA[k] of a row in [br,bc) with column index j, VA[k]*rhs[j]
	 * is accumulated (and scaled by alpha) for out[row], while VA[k] times (alpha * offset
	 * rhs entry of the row) is added to the offset output entry j.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff): target of the mirrored updates */
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr=(alpha)*xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;
		rsb_nnz_idx_t last=0;
		rsb_coo_idx_t col=0;

		if(first==end)
			continue;	/* empty row: out[row] is left untouched */
		last=end-1;

		/* leading element: the mirrored update is skipped only when roff==coff and col==row */
		col=bindx[nz];
		acc += VA[nz]*rhs[1*col*1];
		if(!(roff==coff && col==row))
			yoff[(1)*(col)*1]+=VA[nz]*xr;

		/* elements strictly between the first and the last one, four per pass */
		for(nz=first+1;nz+3<last;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=(v0)*xr,m1=(v1)*xr,m2=(v2)*xr,m3=(v3)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
			acc += (v1)*x1;
			yoff[(1)*(c1)*1]+=m1;
			acc += (v2)*x2;
			yoff[(1)*(c2)*1]+=m2;
			acc += (v3)*x3;
			yoff[(1)*(c3)*1]+=m3;
		}

		/* remaining interior elements, one per pass */
		for(;nz<last;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];
			const float complex m0=(v0)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
		}

		/* trailing element (absent if the row has a single element): same guard as the leading one */
		if(nz<end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(roff==coff && col==row))
				yoff[(1)*(col)*1]+=VA[nz]*xr;
		}
		out[(1*row*1)]+=(alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tT_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 * For every stored element VA[k] of a row in [br,bc) with column index j, VA[k]*rhs[j]
	 * is accumulated (and scaled by alpha) for out[row], while VA[k] times (alpha * offset
	 * rhs entry of the row) is added to the offset output entry j.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff): target of the mirrored updates */
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr=(alpha)*xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;
		rsb_nnz_idx_t last=0;
		rsb_coo_idx_t col=0;

		if(first==end)
			continue;	/* empty row: out[row] is left untouched */
		last=end-1;

		/* leading element: the mirrored update is skipped only when roff==coff and col==row */
		col=bindx[nz];
		acc += VA[nz]*rhs[1*col*1];
		if(!(roff==coff && col==row))
			yoff[(1)*(col)*1]+=VA[nz]*xr;

		/* elements strictly between the first and the last one, four per pass */
		for(nz=first+1;nz+3<last;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=(v0)*xr,m1=(v1)*xr,m2=(v2)*xr,m3=(v3)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
			acc += (v1)*x1;
			yoff[(1)*(c1)*1]+=m1;
			acc += (v2)*x2;
			yoff[(1)*(c2)*1]+=m2;
			acc += (v3)*x3;
			yoff[(1)*(c3)*1]+=m3;
		}

		/* remaining interior elements, one per pass */
		for(;nz<last;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];
			const float complex m0=(v0)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
		}

		/* trailing element (absent if the row has a single element): same guard as the leading one */
		if(nz<end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(roff==coff && col==row))
				yoff[(1)*(col)*1]+=VA[nz]*xr;
		}
		out[(1*row*1)]+=(alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tT_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 * For every stored element VA[k] of a row in [br,bc) with column index j, VA[k]*rhs[j]
	 * is accumulated (and scaled by alpha) for out[row], while VA[k] times (alpha * offset
	 * rhs entry of the row) is added to the offset output entry j.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex alpha=*alphap;
	const float complex *xoff = rhs+1*(roff-coff);	/* rhs shifted by (roff-coff) */
	float complex *yoff = out+1*(coff-roff);	/* out shifted by (coff-roff): target of the mirrored updates */
	rsb_coo_idx_t row=0;

	for(row=br;RSB_LIKELY(row<bc);++row)	/* experimental, for the bounded box patch */
	{
		const float complex xr=(alpha)*xoff[(1*1*(row))];
		const rsb_nnz_idx_t first=bpntr[row],end=bpntr[row+1];
		float complex acc = ((float complex)(0));
		rsb_nnz_idx_t nz=first;
		rsb_nnz_idx_t last=0;
		rsb_coo_idx_t col=0;

		if(first==end)
			continue;	/* empty row: out[row] is left untouched */
		last=end-1;

		/* leading element: the mirrored update is skipped only when roff==coff and col==row */
		col=bindx[nz];
		acc += VA[nz]*rhs[1*col*1];
		if(!(roff==coff && col==row))
			yoff[(1)*(col)*1]+=VA[nz]*xr;

		/* elements strictly between the first and the last one, four per pass */
		for(nz=first+1;nz+3<last;nz+=4)
		{
			const rsb_coo_idx_t c0=bindx[nz+0],c1=bindx[nz+1],c2=bindx[nz+2],c3=bindx[nz+3];
			const float complex x0=rhs[1*(c0)*1],x1=rhs[1*(c1)*1],x2=rhs[1*(c2)*1],x3=rhs[1*(c3)*1];
			const float complex v0=VA[nz+0],v1=VA[nz+1],v2=VA[nz+2],v3=VA[nz+3];
			const float complex m0=(v0)*xr,m1=(v1)*xr,m2=(v2)*xr,m3=(v3)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
			acc += (v1)*x1;
			yoff[(1)*(c1)*1]+=m1;
			acc += (v2)*x2;
			yoff[(1)*(c2)*1]+=m2;
			acc += (v3)*x3;
			yoff[(1)*(c3)*1]+=m3;
		}

		/* remaining interior elements, one per pass */
		for(;nz<last;++nz)
		{
			const rsb_coo_idx_t c0=bindx[nz];
			const float complex x0=rhs[1*(c0)*1];
			const float complex v0=VA[nz];
			const float complex m0=(v0)*xr;

			acc += (v0)*x0;
			yoff[(1)*(c0)*1]+=m0;
		}

		/* trailing element (absent if the row has a single element): same guard as the leading one */
		if(nz<end)
		{
			col=bindx[nz];
			acc += VA[nz]*rhs[1*col*1];
			if(!(roff==coff && col==row))
				yoff[(1)*(col)*1]+=VA[nz]*xr;
		}
		out[(1*row*1)]+=(alpha)*acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tC_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br .. bc-1 are visited; the entries of row i are VA[bpntr[i]] .. VA[bpntr[i+1]-1],
	 * with their column indices at the same positions of bindx.
	 * An entry with value v and column j is used twice, conjugated both times:
	 *  - conj(v) * rhs[j] is summed per row, and alpha times that sum is added to out[i];
	 *  - conj(v) * (alpha * rhs[i+roff-coff]) is added to out[j+coff-roff].
	 * The second update is skipped when roff == coff and j == i, but only the
	 * first and the last entry of a row are tested for this.
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets, for the mirrored updates */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	const int off_diag_block = (roff != coff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* row-side sum for out[row] */
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos;

		if(row_begin == row_end)
			continue;	/* nothing stored for this row */

		{
			/* first entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[row_begin];
			const float complex v = VA[row_begin];

			acc += conjf(v) * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		/* interior entries, unrolled by four; no diagonal test */
		pos = row_begin + 1;
		while(pos + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex t0 = conjf(v0) * xi;
			const float complex t1 = conjf(v1) * xi;
			const float complex t2 = conjf(v2) * xi;
			const float complex t3 = conjf(v3) * xi;

			acc += conjf(v0) * x0;
			yt[col0] += t0;
			acc += conjf(v1) * x1;
			yt[col1] += t1;
			acc += conjf(v2) * x2;
			yt[col2] += t2;
			acc += conjf(v3) * x3;
			yt[col3] += t3;
			pos += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(pos < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];
			const float complex t = conjf(v) * xi;

			acc += conjf(v) * x;
			yt[col] += t;
			++pos;
		}

		if(pos < row_end)
		{
			/* last entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];

			acc += conjf(v) * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tC_r1_c1_uu_sS_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br .. bc-1 are visited; the entries of row i are VA[bpntr[i]] .. VA[bpntr[i+1]-1],
	 * with their column indices at the same positions of bindx.
	 * An entry with value v and column j is used twice, conjugated both times:
	 *  - conj(v) * rhs[j] is summed per row, and alpha times that sum is added to out[i];
	 *  - conj(v) * (alpha * rhs[i+roff-coff]) is added to out[j+coff-roff].
	 * The second update is skipped when roff == coff and j == i, but only the
	 * first and the last entry of a row are tested for this.
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets, for the mirrored updates */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	const int off_diag_block = (roff != coff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* row-side sum for out[row] */
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos;

		if(row_begin == row_end)
			continue;	/* nothing stored for this row */

		{
			/* first entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[row_begin];
			const float complex v = VA[row_begin];

			acc += conjf(v) * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		/* interior entries, unrolled by four; no diagonal test */
		pos = row_begin + 1;
		while(pos + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex t0 = conjf(v0) * xi;
			const float complex t1 = conjf(v1) * xi;
			const float complex t2 = conjf(v2) * xi;
			const float complex t3 = conjf(v3) * xi;

			acc += conjf(v0) * x0;
			yt[col0] += t0;
			acc += conjf(v1) * x1;
			yt[col1] += t1;
			acc += conjf(v2) * x2;
			yt[col2] += t2;
			acc += conjf(v3) * x3;
			yt[col3] += t3;
			pos += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(pos < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];
			const float complex t = conjf(v) * xi;

			acc += conjf(v) * x;
			yt[col] += t;
			++pos;
		}

		if(pos < row_end)
		{
			/* last entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];

			acc += conjf(v) * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tN_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br .. bc-1 are visited; the entries of row i are VA[bpntr[i]] .. VA[bpntr[i+1]-1],
	 * with their column indices at the same positions of bindx.
	 * An entry with value v and column j is used twice:
	 *  - v * rhs[j] is summed per row, and alpha times that sum is added to out[i];
	 *  - conj(v) * (alpha * rhs[i+roff-coff]) is added to out[j+coff-roff].
	 * The second update is skipped when roff == coff and j == i, but only the
	 * first and the last entry of a row are tested for this.
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets, for the mirrored updates */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	const int off_diag_block = (roff != coff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* row-side sum for out[row] */
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos;

		if(row_begin == row_end)
			continue;	/* nothing stored for this row */

		{
			/* first entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[row_begin];
			const float complex v = VA[row_begin];

			acc += v * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		/* interior entries, unrolled by four; no diagonal test */
		pos = row_begin + 1;
		while(pos + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex t0 = conjf(v0) * xi;
			const float complex t1 = conjf(v1) * xi;
			const float complex t2 = conjf(v2) * xi;
			const float complex t3 = conjf(v3) * xi;

			acc += v0 * x0;
			yt[col0] += t0;
			acc += v1 * x1;
			yt[col1] += t1;
			acc += v2 * x2;
			yt[col2] += t2;
			acc += v3 * x3;
			yt[col3] += t3;
			pos += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(pos < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];
			const float complex t = conjf(v) * xi;

			acc += v * x;
			yt[col] += t;
			++pos;
		}

		if(pos < row_end)
		{
			/* last entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];

			acc += v * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tN_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br .. bc-1 are visited; the entries of row i are VA[bpntr[i]] .. VA[bpntr[i+1]-1],
	 * with their column indices at the same positions of bindx.
	 * An entry with value v and column j is used twice:
	 *  - v * rhs[j] is summed per row, and alpha times that sum is added to out[i];
	 *  - conj(v) * (alpha * rhs[i+roff-coff]) is added to out[j+coff-roff].
	 * The second update is skipped when roff == coff and j == i, but only the
	 * first and the last entry of a row are tested for this.
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets, for the mirrored updates */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	const int off_diag_block = (roff != coff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* row-side sum for out[row] */
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos;

		if(row_begin == row_end)
			continue;	/* nothing stored for this row */

		{
			/* first entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[row_begin];
			const float complex v = VA[row_begin];

			acc += v * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		/* interior entries, unrolled by four; no diagonal test */
		pos = row_begin + 1;
		while(pos + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex t0 = conjf(v0) * xi;
			const float complex t1 = conjf(v1) * xi;
			const float complex t2 = conjf(v2) * xi;
			const float complex t3 = conjf(v3) * xi;

			acc += v0 * x0;
			yt[col0] += t0;
			acc += v1 * x1;
			yt[col1] += t1;
			acc += v2 * x2;
			yt[col2] += t2;
			acc += v3 * x3;
			yt[col3] += t3;
			pos += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(pos < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];
			const float complex t = conjf(v) * xi;

			acc += v * x;
			yt[col] += t;
			++pos;
		}

		if(pos < row_end)
		{
			/* last entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];

			acc += v * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tT_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br .. bc-1 are visited; the entries of row i are VA[bpntr[i]] .. VA[bpntr[i+1]-1],
	 * with their column indices at the same positions of bindx.
	 * An entry with value v and column j is used twice:
	 *  - conj(v) * rhs[j] is summed per row, and alpha times that sum is added to out[i];
	 *  - v * (alpha * rhs[i+roff-coff]) is added to out[j+coff-roff].
	 * The second update is skipped when roff == coff and j == i, but only the
	 * first and the last entry of a row are tested for this.
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets, for the mirrored updates */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	const int off_diag_block = (roff != coff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* row-side sum for out[row] */
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos;

		if(row_begin == row_end)
			continue;	/* nothing stored for this row */

		{
			/* first entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[row_begin];
			const float complex v = VA[row_begin];

			acc += conjf(v) * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += v * xi;
		}

		/* interior entries, unrolled by four; no diagonal test */
		pos = row_begin + 1;
		while(pos + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex t0 = v0 * xi;
			const float complex t1 = v1 * xi;
			const float complex t2 = v2 * xi;
			const float complex t3 = v3 * xi;

			acc += conjf(v0) * x0;
			yt[col0] += t0;
			acc += conjf(v1) * x1;
			yt[col1] += t1;
			acc += conjf(v2) * x2;
			yt[col2] += t2;
			acc += conjf(v3) * x3;
			yt[col3] += t3;
			pos += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(pos < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];
			const float complex t = v * xi;

			acc += conjf(v) * x;
			yt[col] += t;
			++pos;
		}

		if(pos < row_end)
		{
			/* last entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];

			acc += conjf(v) * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += v * xi;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tT_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br .. bc-1 are visited; the entries of row i are VA[bpntr[i]] .. VA[bpntr[i+1]-1],
	 * with their column indices at the same positions of bindx.
	 * An entry with value v and column j is used twice:
	 *  - conj(v) * rhs[j] is summed per row, and alpha times that sum is added to out[i];
	 *  - v * (alpha * rhs[i+roff-coff]) is added to out[j+coff-roff].
	 * The second update is skipped when roff == coff and j == i, but only the
	 * first and the last entry of a row are tested for this.
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets, for the mirrored updates */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	const int off_diag_block = (roff != coff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* row-side sum for out[row] */
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos;

		if(row_begin == row_end)
			continue;	/* nothing stored for this row */

		{
			/* first entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[row_begin];
			const float complex v = VA[row_begin];

			acc += conjf(v) * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += v * xi;
		}

		/* interior entries, unrolled by four; no diagonal test */
		pos = row_begin + 1;
		while(pos + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex t0 = v0 * xi;
			const float complex t1 = v1 * xi;
			const float complex t2 = v2 * xi;
			const float complex t3 = v3 * xi;

			acc += conjf(v0) * x0;
			yt[col0] += t0;
			acc += conjf(v1) * x1;
			yt[col1] += t1;
			acc += conjf(v2) * x2;
			yt[col2] += t2;
			acc += conjf(v3) * x3;
			yt[col3] += t3;
			pos += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(pos < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];
			const float complex t = v * xi;

			acc += conjf(v) * x;
			yt[col] += t;
			++pos;
		}

		if(pos < row_end)
		{
			/* last entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];

			acc += conjf(v) * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += v * xi;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tC_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br .. bc-1 are visited; the entries of row i are VA[bpntr[i]] .. VA[bpntr[i+1]-1],
	 * with their column indices at the same positions of bindx.
	 * An entry with value v and column j is used twice:
	 *  - v * rhs[j] is summed per row, and alpha times that sum is added to out[i];
	 *  - conj(v) * (alpha * rhs[i+roff-coff]) is added to out[j+coff-roff].
	 * The second update is skipped when roff == coff and j == i, but only the
	 * first and the last entry of a row are tested for this.
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets, for the mirrored updates */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	const int off_diag_block = (roff != coff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* row-side sum for out[row] */
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos;

		if(row_begin == row_end)
			continue;	/* nothing stored for this row */

		{
			/* first entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[row_begin];
			const float complex v = VA[row_begin];

			acc += v * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		/* interior entries, unrolled by four; no diagonal test */
		pos = row_begin + 1;
		while(pos + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex t0 = conjf(v0) * xi;
			const float complex t1 = conjf(v1) * xi;
			const float complex t2 = conjf(v2) * xi;
			const float complex t3 = conjf(v3) * xi;

			acc += v0 * x0;
			yt[col0] += t0;
			acc += v1 * x1;
			yt[col1] += t1;
			acc += v2 * x2;
			yt[col2] += t2;
			acc += v3 * x3;
			yt[col3] += t3;
			pos += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(pos < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];
			const float complex t = conjf(v) * xi;

			acc += v * x;
			yt[col] += t;
			++pos;
		}

		if(pos < row_end)
		{
			/* last entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];

			acc += v * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tC_r1_c1_uu_sH_dE_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal explicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * Rows br .. bc-1 are visited; the entries of row i are VA[bpntr[i]] .. VA[bpntr[i+1]-1],
	 * with their column indices at the same positions of bindx.
	 * An entry with value v and column j is used twice:
	 *  - v * rhs[j] is summed per row, and alpha times that sum is added to out[i];
	 *  - conj(v) * (alpha * rhs[i+roff-coff]) is added to out[j+coff-roff].
	 * The second update is skipped when roff == coff and j == i, but only the
	 * first and the last entry of a row are tested for this.
	 * A row with no entries leaves out untouched.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets, for the mirrored updates */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	const int off_diag_block = (roff != coff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* row-side sum for out[row] */
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos;

		if(row_begin == row_end)
			continue;	/* nothing stored for this row */

		{
			/* first entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[row_begin];
			const float complex v = VA[row_begin];

			acc += v * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		/* interior entries, unrolled by four; no diagonal test */
		pos = row_begin + 1;
		while(pos + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex t0 = conjf(v0) * xi;
			const float complex t1 = conjf(v1) * xi;
			const float complex t2 = conjf(v2) * xi;
			const float complex t3 = conjf(v3) * xi;

			acc += v0 * x0;
			yt[col0] += t0;
			acc += v1 * x1;
			yt[col1] += t1;
			acc += v2 * x2;
			yt[col2] += t2;
			acc += v3 * x3;
			yt[col3] += t3;
			pos += 4;
		}

		/* interior entries left over by the unrolled loop */
		while(pos < row_end - 1)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];
			const float complex t = conjf(v) * xi;

			acc += v * x;
			yt[col] += t;
			++pos;
		}

		if(pos < row_end)
		{
			/* last entry: tested against the diagonal */
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];

			acc += v * rhs[col];
			if(off_diag_block || col != row)
				yt[col] += conjf(v) * xi;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tN_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * For each row i in br .. bc-1, the products VA[k] * rhs[bindx[k]] over
	 * k = bpntr[i] .. bpntr[i+1]-1 are summed, and alpha times that sum is added
	 * to out[i] (also for a row with no entries, where the sum is zero).
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex scale = *alphap;
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* dot product of this row with rhs */
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos = bpntr[row];

		/* four entries per pass */
		while(pos + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];

			acc += v0 * x0;
			acc += v1 * x1;
			acc += v2 * x2;
			acc += v3 * x3;
			pos += 4;
		}

		/* up to three entries left over */
		while(pos < row_end)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];

			acc += v * x;
			++pos;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tN_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * For each row i in br .. bc-1, the products VA[k] * rhs[bindx[k]] over
	 * k = bpntr[i] .. bpntr[i+1]-1 are summed, and alpha times that sum is added
	 * to out[i] (also for a row with no entries, where the sum is zero).
	 * Mdim, mdim, indptr, rpntr, cpntr, roff, coff and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex scale = *alphap;
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex acc = 0.0f;	/* dot product of this row with rhs */
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos = bpntr[row];

		/* four entries per pass */
		while(pos + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];

			acc += v0 * x0;
			acc += v1 * x1;
			acc += v2 * x2;
			acc += v3 * x3;
			pos += 4;
		}

		/* up to three entries left over */
		while(pos < row_end)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex x = rhs[col];

			acc += v * x;
			++pos;
		}

		out[row] += scale * acc;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tT_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * For each row i in br .. bc-1 and each k = bpntr[i] .. bpntr[i+1]-1,
	 * VA[k] * (alpha * rhs[i+roff-coff]) is added to out[bindx[k]+coff-roff].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos = bpntr[row];

		/* four entries per pass */
		while(pos + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex t0 = v0 * xi;
			const float complex t1 = v1 * xi;
			const float complex t2 = v2 * xi;
			const float complex t3 = v3 * xi;

			yt[col0] += t0;
			yt[col1] += t1;
			yt[col2] += t2;
			yt[col3] += t3;
			pos += 4;
		}

		/* up to three entries left over */
		while(pos < row_end)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex t = v * xi;

			yt[col] += t;
			++pos;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tT_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x, where A \neq A^T. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 *
	 * For each row i in br .. bc-1 and each k = bpntr[i] .. bpntr[i+1]-1,
	 * VA[k] * (alpha * rhs[i+roff-coff]) is added to out[bindx[k]+coff-roff].
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not referenced here.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex scale = *alphap;
	/* rhs and out re-based by the submatrix offsets */
	const float complex *xt = rhs + (roff - coff);
	float complex *yt = out + (coff - roff);
	rsb_coo_idx_t row;

	for(row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float complex xi = scale * xt[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		rsb_nnz_idx_t pos = bpntr[row];

		/* four entries per pass */
		while(pos + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[pos];
			const rsb_coo_idx_t col1 = bindx[pos + 1];
			const rsb_coo_idx_t col2 = bindx[pos + 2];
			const rsb_coo_idx_t col3 = bindx[pos + 3];
			const float complex v0 = VA[pos];
			const float complex v1 = VA[pos + 1];
			const float complex v2 = VA[pos + 2];
			const float complex v3 = VA[pos + 3];
			const float complex t0 = v0 * xi;
			const float complex t1 = v1 * xi;
			const float complex t2 = v2 * xi;
			const float complex t3 = v3 * xi;

			yt[col0] += t0;
			yt[col1] += t1;
			yt[col2] += t2;
			yt[col3] += t3;
			pos += 4;
		}

		/* up to three entries left over */
		while(pos < row_end)
		{
			const rsb_coo_idx_t col = bindx[pos];
			const float complex v = VA[pos];
			const float complex t = v * xi;

			yt[col] += t;
			++pos;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tC_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_coo_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) adds
	 * conjf(VA[k]) * (alpha * trhs[i]) to tout[bindx[k]], where trhs = rhs+(roff-coff)
	 * and tout = out+(coff-roff).
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = bpntr[row];

		/* Unrolled part: four entries per pass; all products are formed before the scatter. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex prod0 = conjf(VA[nz]) * sx;
			const float complex prod1 = conjf(VA[nz + 1]) * sx;
			const float complex prod2 = conjf(VA[nz + 2]) * sx;
			const float complex prod3 = conjf(VA[nz + 3]) * sx;

			yv[col0] += prod0;
			yv[col1] += prod1;
			yv[col2] += prod2;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Remainder: the (at most three) trailing entries, one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];

			yv[col] += conjf(VA[nz]) * sx;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tC_r1_c1_uu_sU_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with A unsymmetric (\f$A \neq A^T\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_half_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) adds
	 * conjf(VA[k]) * (alpha * trhs[i]) to tout[bindx[k]], where trhs = rhs+(roff-coff)
	 * and tout = out+(coff-roff).
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = bpntr[row];

		/* Unrolled part: four entries per pass; all products are formed before the scatter. */
		while (nz + 3 < row_end)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex prod0 = conjf(VA[nz]) * sx;
			const float complex prod1 = conjf(VA[nz + 1]) * sx;
			const float complex prod2 = conjf(VA[nz + 2]) * sx;
			const float complex prod3 = conjf(VA[nz + 3]) * sx;

			yv[col0] += prod0;
			yv[col1] += prod1;
			yv[col2] += prod2;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Remainder: the (at most three) trailing entries, one at a time. */
		while (nz < row_end)
		{
			const rsb_coo_idx_t col = bindx[nz];

			yv[col] += conjf(VA[nz]) * sx;
			++nz;
		}
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tN_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_coo_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes VA[k]*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and VA[k]*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += VA[nz] * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = v0 * sx;
			const float complex prod1 = v1 * sx;
			const float complex prod2 = v2 * sx;
			const float complex prod3 = v3 * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			dot += v1 * x1;
			yv[col1] += prod1;
			dot += v2 * x2;
			yv[col2] += prod2;
			dot += v3 * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = v0 * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tN_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_half_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes VA[k]*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and VA[k]*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += VA[nz] * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = v0 * sx;
			const float complex prod1 = v1 * sx;
			const float complex prod2 = v2 * sx;
			const float complex prod3 = v3 * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			dot += v1 * x1;
			yv[col1] += prod1;
			dot += v2 * x2;
			yv[col2] += prod2;
			dot += v3 * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = v0 * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tT_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_coo_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes VA[k]*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and VA[k]*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += VA[nz] * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = v0 * sx;
			const float complex prod1 = v1 * sx;
			const float complex prod2 = v2 * sx;
			const float complex prod3 = v3 * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			dot += v1 * x1;
			yv[col1] += prod1;
			dot += v2 * x2;
			yv[col2] += prod2;
			dot += v3 * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = v0 * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tT_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_half_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes VA[k]*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and VA[k]*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += VA[nz] * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = v0 * sx;
			const float complex prod1 = v1 * sx;
			const float complex prod2 = v2 * sx;
			const float complex prod3 = v3 * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			dot += v1 * x1;
			yv[col1] += prod1;
			dot += v2 * x2;
			yv[col2] += prod2;
			dot += v3 * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = v0 * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tC_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_coo_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes conjf(VA[k])*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and conjf(VA[k])*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += conjf(VA[nz]) * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += conjf(VA[nz]) * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = conjf(v0) * sx;
			const float complex prod1 = conjf(v1) * sx;
			const float complex prod2 = conjf(v2) * sx;
			const float complex prod3 = conjf(v3) * sx;

			dot += conjf(v0) * x0;
			yv[col0] += prod0;
			dot += conjf(v1) * x1;
			yv[col1] += prod1;
			dot += conjf(v2) * x2;
			yv[col2] += prod2;
			dot += conjf(v3) * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = conjf(v0) * sx;

			dot += conjf(v0) * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += conjf(VA[nz]) * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += conjf(VA[nz]) * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tC_r1_c1_uu_sS_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x\f$, with A symmetric (\f$A == A^T\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_half_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes conjf(VA[k])*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and conjf(VA[k])*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += conjf(VA[nz]) * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += conjf(VA[nz]) * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = conjf(v0) * sx;
			const float complex prod1 = conjf(v1) * sx;
			const float complex prod2 = conjf(v2) * sx;
			const float complex prod3 = conjf(v3) * sx;

			dot += conjf(v0) * x0;
			yv[col0] += prod0;
			dot += conjf(v1) * x1;
			yv[col1] += prod1;
			dot += conjf(v2) * x2;
			yv[col2] += prod2;
			dot += conjf(v3) * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = conjf(v0) * sx;

			dot += conjf(v0) * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += conjf(VA[nz]) * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += conjf(VA[nz]) * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tN_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with A hermitian (\f$A == A^H\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_coo_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes VA[k]*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and conjf(VA[k])*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += conjf(VA[nz]) * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = conjf(v0) * sx;
			const float complex prod1 = conjf(v1) * sx;
			const float complex prod2 = conjf(v2) * sx;
			const float complex prod3 = conjf(v3) * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			dot += v1 * x1;
			yv[col1] += prod1;
			dot += v2 * x2;
			yv[col2] += prod2;
			dot += v3 * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = conjf(v0) * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += conjf(VA[nz]) * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tN_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A} \cdot x\f$, with A hermitian (\f$A == A^H\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_half_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes VA[k]*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and conjf(VA[k])*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += conjf(VA[nz]) * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = conjf(v0) * sx;
			const float complex prod1 = conjf(v1) * sx;
			const float complex prod2 = conjf(v2) * sx;
			const float complex prod3 = conjf(v3) * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			dot += v1 * x1;
			yv[col1] += prod1;
			dot += v2 * x2;
			yv[col2] += prod2;
			dot += v3 * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = conjf(v0) * sx;

			dot += v0 * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += conjf(VA[nz]) * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tT_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with A hermitian (\f$A == A^H\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_coo_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes conjf(VA[k])*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and VA[k]*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += conjf(VA[nz]) * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += VA[nz] * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = v0 * sx;
			const float complex prod1 = v1 * sx;
			const float complex prod2 = v2 * sx;
			const float complex prod3 = v3 * sx;

			dot += conjf(v0) * x0;
			yv[col0] += prod0;
			dot += conjf(v1) * x1;
			yv[col1] += prod1;
			dot += conjf(v2) * x2;
			yv[col2] += prod2;
			dot += conjf(v3) * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = v0 * sx;

			dot += conjf(v0) * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += conjf(VA[nz]) * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tT_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Kernel for \f$y \leftarrow y + \alpha \cdot {A^T} \cdot x\f$, with A hermitian (\f$A == A^H\f$).
	 * Expects A blocked 1 x 1, in BCSR format, with implicit diagonal, of type float complex, and with rsb_half_idx_t column indices.
	 *
	 * For every i in [br,bc), each entry k in [bpntr[i],bpntr[i+1]) with j = bindx[k]
	 * contributes conjf(VA[k])*rhs[j] to a per-row sum (added, scaled by alpha, to out[i])
	 * and VA[k]*(alpha*trhs[i]) to tout[j]; trhs = rhs+(roff-coff), tout = out+(coff-roff).
	 * The second contribution is skipped for a first or last entry having j == i when roff == coff.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: the implicit diagonal is not dealt with in this kernel: see the caller. */
	const float complex scale = *alphap;
	const float complex *xv = rhs + (roff - coff);
	float complex *yv = out + (coff - roff);
	register rsb_coo_idx_t row, col;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		register float complex dot = ((float complex)(0));
		const float complex sx = scale * xv[row];
		const rsb_nnz_idx_t row_begin = bpntr[row];
		const rsb_nnz_idx_t row_end = bpntr[row + 1];
		register rsb_nnz_idx_t nz = row_begin;

		if (nz == row_end)
			continue;	/* no entries: out[row] is not touched at all */

		/* First entry: the j == i test is applied only here and on the last entry
		 * (presumably column indices are sorted within a row -- confirm at caller). */
		col = bindx[nz];
		dot += conjf(VA[nz]) * rhs[col];
		if (!(roff == coff && col == row))
			yv[col] += VA[nz] * sx;

		/* Interior entries, four per pass; every operand is loaded before the first store. */
		nz = row_begin + 1;
		while (nz + 3 < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const rsb_coo_idx_t col1 = bindx[nz + 1];
			const rsb_coo_idx_t col2 = bindx[nz + 2];
			const rsb_coo_idx_t col3 = bindx[nz + 3];
			const float complex x0 = rhs[col0];
			const float complex x1 = rhs[col1];
			const float complex x2 = rhs[col2];
			const float complex x3 = rhs[col3];
			const float complex v0 = VA[nz];
			const float complex v1 = VA[nz + 1];
			const float complex v2 = VA[nz + 2];
			const float complex v3 = VA[nz + 3];
			const float complex prod0 = v0 * sx;
			const float complex prod1 = v1 * sx;
			const float complex prod2 = v2 * sx;
			const float complex prod3 = v3 * sx;

			dot += conjf(v0) * x0;
			yv[col0] += prod0;
			dot += conjf(v1) * x1;
			yv[col1] += prod1;
			dot += conjf(v2) * x2;
			yv[col2] += prod2;
			dot += conjf(v3) * x3;
			yv[col3] += prod3;
			nz += 4;
		}

		/* Interior leftovers, one at a time (the last entry is still held back). */
		while (nz < row_end - 1)
		{
			const rsb_coo_idx_t col0 = bindx[nz];
			const float complex x0 = rhs[col0];
			const float complex v0 = VA[nz];
			const float complex prod0 = v0 * sx;

			dot += conjf(v0) * x0;
			yv[col0] += prod0;
			++nz;
		}

		/* Last entry (absent when the row holds a single entry). */
		if (nz < row_end)
		{
			col = bindx[nz];
			dot += conjf(VA[nz]) * rhs[col];
			if (!(roff == coff && col == row))
				yv[col] += VA[nz] * sx;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_C__tC_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_coo_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
	 * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_coo_idx_t column indices.
	 *
	 * Rows br (inclusive) to bc (exclusive) are visited; row i owns the entries
	 * bpntr[i] .. bpntr[i+1]-1 of VA and bindx. Each stored entry a found at
	 * column j is used twice: a*rhs[j] is accumulated for out[i], and
	 * conjf(a)*(alpha*x_i) is added to the mirrored position of the output.
	 * Mdim, mdim, indptr, rpntr, cpntr and flags are not read by this kernel.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	const float complex scale = *alphap;
	/* Views of rhs / out shifted by the difference of the two offsets. */
	const float complex *row_rhs = rhs + (roff - coff);
	float complex *col_out = out + (coff - roff);
	rsb_coo_idx_t row = 0;

	for (row = br; RSB_LIKELY(row < bc); ++row)	/* experimental, for the bounded box patch */
	{
		float complex dot = 0.0f;	/* gathered contribution for out[row] */
		const float complex xs = scale * row_rhs[row];	/* alpha * x_row, reused by every scatter */
		const rsb_nnz_idx_t first = bpntr[row];
		const rsb_nnz_idx_t end = bpntr[row + 1];
		rsb_nnz_idx_t nz = first;
		rsb_coo_idx_t col = 0;

		if (first == end)
			continue;	/* empty row: out[row] is left untouched */

		const rsb_nnz_idx_t last = end - 1;

		/*
		 * First entry of the row. Only the first and the last entries are
		 * checked against (row,row) before scattering; presumably a diagonal
		 * entry can only sit at one of the two ends -- confirm against the
		 * generator.
		 */
		col = bindx[nz];
		dot += VA[nz] * rhs[col];
		if (!(roff == coff && col == row))
			col_out[col] += conjf(VA[nz]) * xs;

		/* Interior entries, four at a time: all loads first, then the updates in entry order. */
		for (nz = first + 1; nz + 3 < last; nz += 4)
		{
			const rsb_coo_idx_t *ci = bindx + nz;
			const float complex *va = VA + nz;
			const rsb_coo_idx_t c0 = ci[0], c1 = ci[1], c2 = ci[2], c3 = ci[3];
			const float complex a0 = va[0], a1 = va[1], a2 = va[2], a3 = va[3];
			const float complex x0 = rhs[c0], x1 = rhs[c1], x2 = rhs[c2], x3 = rhs[c3];
			const float complex s0 = conjf(a0) * xs;
			const float complex s1 = conjf(a1) * xs;
			const float complex s2 = conjf(a2) * xs;
			const float complex s3 = conjf(a3) * xs;

			dot += (a0) * x0;
			col_out[c0] += s0;
			dot += (a1) * x1;
			col_out[c1] += s1;
			dot += (a2) * x2;
			col_out[c2] += s2;
			dot += (a3) * x3;
			col_out[c3] += s3;
		}

		/* Remaining interior entries, one at a time (the last entry is excluded). */
		for (; nz < last; ++nz)
		{
			const rsb_coo_idx_t c0 = bindx[nz];
			const float complex x0 = rhs[c0];
			const float complex a0 = VA[nz];
			const float complex s0 = conjf(a0) * xs;

			dot += (a0) * x0;
			col_out[c0] += s0;
		}

		/* Last entry of the row (absent when the row holds a single entry). */
		if (nz < end)
		{
			col = bindx[nz];
			dot += VA[nz] * rhs[col];
			if (!(roff == coff && col == row))
				col_out[col] += conjf(VA[nz]) * xs;
		}

		out[row] += scale * dot;
	}

	return RSB_ERR_NO_ERROR;
}



rsb_err_t rsb__BCSR_spmv_uxua_float_complex_H__tC_r1_c1_uu_sH_dI_uG(const float complex * restrict VA, const float complex * restrict rhs, float complex * restrict out, const rsb_coo_idx_t  Mdim,const rsb_coo_idx_t  mdim,const rsb_half_idx_t * restrict bindx,const rsb_nnz_idx_t * restrict bpntr,const rsb_nnz_idx_t *restrict indptr,const rsb_coo_idx_t * restrict rpntr,const rsb_coo_idx_t * restrict cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const float complex * restrict alphap)
{

	/**
	 * \ingroup rsb_doc_kernels
	 * Computes \f$y \leftarrow y + \alpha \cdot {A^H} \cdot x, where A == A^H. \f$
         * Matrix A should be blocked 1 x 1, stored in BCSR format, diagonal implicit, of type float complex, with rsb_half_idx_t column indices.
	 * \return \rsb_errval_inp_param_msg
	 */

	/* NOTE: Diagonal implicit is not really handled here: look at caller level. */
	/*
	*/
	register rsb_coo_idx_t i=0,j=0;
	register rsb_nnz_idx_t k=0;
	const float complex alpha=*alphap;	const float complex *trhs