From 608c136d22142cbf0f1dde4e1a739bc225418c75 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 21 Mar 2025 19:54:05 +0100 Subject: [PATCH] Initial implementation of Libero backend for PolarFire line of FPGAs --- hls4ml/backends/__init__.py | 2 + hls4ml/backends/libero/__init__.py | 0 hls4ml/backends/libero/libero_backend.py | 192 ++++ hls4ml/backends/libero/libero_types.py | 125 +++ .../backends/libero/passes/core_templates.py | 152 ++++ .../backends/libero/passes/pipeline_style.py | 101 +++ .../backends/libero/passes/transform_types.py | 47 + hls4ml/report/__init__.py | 1 + hls4ml/report/libero_report.py | 117 +++ hls4ml/templates/libero/build_lib.sh | 20 + hls4ml/templates/libero/firmware/defines.h | 19 + .../templates/libero/firmware/myproject.cpp | 19 + hls4ml/templates/libero/firmware/myproject.h | 19 + hls4ml/templates/libero/firmware/parameters.h | 19 + hls4ml/templates/libero/myproject_bridge.cpp | 69 ++ hls4ml/templates/libero/myproject_test.cpp | 96 ++ .../libero/nnet_utils/nnet_activation.h | 800 +++++++++++++++++ .../nnet_utils/nnet_activation_stream.h | 800 +++++++++++++++++ .../libero/nnet_utils/nnet_code_gen.h | 28 + .../templates/libero/nnet_utils/nnet_common.h | 65 ++ .../libero/nnet_utils/nnet_conv1d_latency.h | 167 ++++ .../templates/libero/nnet_utils/nnet_dense.h | 82 ++ .../libero/nnet_utils/nnet_dense_compressed.h | 89 ++ .../libero/nnet_utils/nnet_dense_latency.h | 72 ++ .../libero/nnet_utils/nnet_dense_resource.h | 270 ++++++ .../libero/nnet_utils/nnet_dense_stream.h | 105 +++ .../libero/nnet_utils/nnet_function_stubs.h | 51 ++ .../libero/nnet_utils/nnet_helpers.h | 279 ++++++ .../templates/libero/nnet_utils/nnet_mult.h | 118 +++ .../templates/libero/nnet_utils/nnet_stream.h | 223 +++++ .../templates/libero/nnet_utils/nnet_types.h | 66 ++ hls4ml/writer/__init__.py | 2 + hls4ml/writer/libero_writer.py | 844 ++++++++++++++++++ 33 files changed, 5059 insertions(+) create mode 100644 hls4ml/backends/libero/__init__.py create mode 100644 hls4ml/backends/libero/libero_backend.py create mode 100644 hls4ml/backends/libero/libero_types.py create mode 100644 hls4ml/backends/libero/passes/core_templates.py create mode 100644 hls4ml/backends/libero/passes/pipeline_style.py create mode 100644 hls4ml/backends/libero/passes/transform_types.py create mode 100644 hls4ml/report/libero_report.py create mode 100644 hls4ml/templates/libero/build_lib.sh create mode 100644 hls4ml/templates/libero/firmware/defines.h create mode 100644 hls4ml/templates/libero/firmware/myproject.cpp create mode 100644 hls4ml/templates/libero/firmware/myproject.h create mode 100644 hls4ml/templates/libero/firmware/parameters.h create mode 100644 hls4ml/templates/libero/myproject_bridge.cpp create mode 100644 hls4ml/templates/libero/myproject_test.cpp create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_activation.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_activation_stream.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_code_gen.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_common.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_conv1d_latency.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense_compressed.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense_latency.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense_resource.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense_stream.h create mode 100644 
hls4ml/templates/libero/nnet_utils/nnet_function_stubs.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_helpers.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_mult.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_types.h create mode 100644 hls4ml/writer/libero_writer.py diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 4a48f072cd..ea1b53b392 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -1,5 +1,6 @@ from hls4ml.backends.backend import Backend, get_available_backends, get_backend, register_backend # noqa: F401 from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 +from hls4ml.backends.libero.libero_backend import LiberoBackend from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend @@ -18,3 +19,4 @@ register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) register_backend('oneAPI', OneAPIBackend) +register_backend('Libero', LiberoBackend) diff --git a/hls4ml/backends/libero/__init__.py b/hls4ml/backends/libero/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/libero/libero_backend.py b/hls4ml/backends/libero/libero_backend.py new file mode 100644 index 0000000000..a17d40af12 --- /dev/null +++ b/hls4ml/backends/libero/libero_backend.py @@ -0,0 +1,192 @@ +import os +import subprocess +import sys + +from hls4ml.backends import FPGABackend +from hls4ml.model.attributes import ChoiceAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import Dense, Layer +from hls4ml.model.optimizer import layer_optimizer +from hls4ml.report import parse_libero_report + + +class LiberoBackend(FPGABackend): + def __init__(self): + super().__init__(name='Libero') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + strategy_layers = [ + Dense, + ] + + for layer in strategy_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append( + ChoiceAttribute( + 'strategy', + choices=['Latency', 'Resource'], + default='Latency', + ) + ) + self.attribute_map[layer] = attrs + + def _register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + libero_types = [ + 'libero:transform_types', + 'libero:set_pipeline_style', + ] + libero_types_flow = register_flow('specific_types', libero_types, requires=[init_flow], backend=self.name) + + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'libero:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=['libero:ip'], backend=self.name) + + ip_flow_requirements = [ + 'optimize', + init_flow, + libero_types_flow, + template_flow, + ] + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config( + self, + fpga_family='PolarFire', + part='MPF300', + board='hw_only', + clock_period=5, + clock_uncertainty='27%', + io_type='io_parallel', + namespace=None, + 
write_weights_txt=True, + write_tar=False, + **_, + ): + """Create initial configuration of the Libero backend. + + Args: + part (str, optional): The FPGA part to be used. Defaults to 'MPF300'. + clock_period (int, optional): The clock period. Defaults to 5. + clock_uncertainty (str, optional): The clock uncertainty. Defaults to 27%. + io_type (str, optional): Type of implementation used. One of + 'io_parallel' or 'io_stream'. Defaults to 'io_parallel'. + namespace (str, optional): If defined, place all generated code within a namespace. Defaults to None. + write_weights_txt (bool, optional): If True, writes weights to .txt files which speeds up compilation. + Defaults to True. + write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to False. + + Returns: + dict: initial configuration. + """ + config = {} + + config['FPGAFamily'] = fpga_family if fpga_family is not None else 'PolarFire' + config['Part'] = part if part is not None else 'MPF300' + config['Board'] = board if board is not None else 'hw_only' + config['ClockPeriod'] = clock_period if clock_period is not None else 5 + config['IOType'] = io_type if io_type is not None else 'io_parallel' + config['HLSConfig'] = {} + config['WriterConfig'] = { + 'Namespace': namespace, + 'WriteWeightsTxt': write_weights_txt, + 'WriteTar': write_tar, + } + + return config + + def build( + self, + model, + reset=False, + skip_preqs=False, + sw_compile=True, + hw=True, + cosim=False, + rtl_synth=False, + fpga=False, + **kwargs, + ): + """Build the model using Libero suite and SmartHLS compiler. Additional arguments passed to the function in form of + `=True` will be passed as an argument to the `shls` command. See SmartHLS user guide for list of possible + command line options. + + Args: + model (ModelGraph): Model to build + reset (bool, optional): Clean up any existing files. Defaults to False. + skip_preqs(bool, optional): Skip any prerequisite step that is outdated. Defaults to False. + sw_compile (bool, optional): Compile the generated HLS in software. Defaults to True. + hw (bool, optional): Compile the software to hardware, producing a set of Verilog HDL files. Defaults to True. + cosim (bool, optional): Run co-simulation. Defaults to False. + rtl_synth (bool, optional): Run RTL synthesis for resource results. This will take less time than `fpga`. + Defaults to False. + fpga (bool, optional): Synthesize the generated hardware to target FPGA. This runs RTL synthesis and + place-and-route for resource and timing results. Defaults to False. + + Raises: + Exception: Raised if the `shls` command has not been found + CalledProcessError: Raised if SmartHLS returns non-zero code for any of the commands executed + + Returns: + dict: Detailed report produced by SmartHLS. + """ + if 'linux' in sys.platform: + found = os.system('command -v shls > /dev/null') + if found != 0: + raise Exception('Libero/SmartHLS installation not found. 
Make sure "shls" is on PATH.') + + def run_shls_cmd(cmd_name): + subprocess.run( + ['shls', '-s', cmd_name], + shell=False, + check=True, + stdout=sys.stdout, + stderr=sys.stderr, + cwd=model.config.get_output_dir(), + ) + + if reset: + run_shls_cmd('clean') + if sw_compile: + run_shls_cmd('sw_compile') + if hw: + run_shls_cmd('hw') + if cosim: + run_shls_cmd('cosim') + if rtl_synth: + run_shls_cmd('rtl_synth') + if fpga: + run_shls_cmd('fpga') + + for arg_name, arg_val in kwargs.items(): + if arg_val: + run_shls_cmd(arg_name) + + return parse_libero_report(model.config.get_output_dir()) + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + @layer_optimizer(Dense) + def init_dense(self, layer): + if layer.model.config.is_resource_strategy(layer): + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + else: + layer.set_attr('strategy', 'latency') diff --git a/hls4ml/backends/libero/libero_types.py b/hls4ml/backends/libero/libero_types.py new file mode 100644 index 0000000000..9ce0364de8 --- /dev/null +++ b/hls4ml/backends/libero/libero_types.py @@ -0,0 +1,125 @@ +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + ExponentPrecisionType, + FixedPrecisionConverter, + FixedPrecisionType, + InplaceStreamVariableConverter, + IntegerPrecisionType, + PrecisionDefinition, + StreamVariableConverter, + VariableDefinition, + XnorPrecisionType, +) + +# region ArrayVariable + + +class LiberoArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class LiberoInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class LiberoArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Libero', definition_cls=LiberoArrayVariableDefinition) + + +class LiberoInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Libero', definition_cls=LiberoInplaceArrayVariableDefinition) + + +# endregion + +# region StreamVariable + + +class LiberoStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'hls::FIFO<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return 'hls::FIFO<{type}> {name}{suffix}({depth})'.format( + type=self.type.name, name=self.name, depth=self.pragma[1], suffix=name_suffix + ) + + +class LiberoInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class LiberoStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Libero', definition_cls=LiberoStreamVariableDefinition) + + +# endregion + +# region InplaceStreamVariable + + +class LiberoInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, 
prefix='Libero', definition_cls=LiberoInplaceStreamVariableDefinition + ) + + +# endregion + +# region Precision types + + +class LAPIntegerPrecisionDefinition(PrecisionDefinition): + def definition_cpp(self): + typestring = 'hls::ap_{signed}int<{width}>'.format(signed='u' if not self.signed else '', width=self.width) + return typestring + + +class LAPFixedPrecisionDefinition(PrecisionDefinition): + def _rounding_mode_cpp(self, mode): + if mode is not None: + return 'AP_' + str(mode) + + def _saturation_mode_cpp(self, mode): + if mode is not None: + return 'AP_' + str(mode) + + def definition_cpp(self): + args = [ + self.width, + self.integer, + self._rounding_mode_cpp(self.rounding_mode), + self._saturation_mode_cpp(self.saturation_mode), + ] + if args[2] == 'AP_TRN' and args[3] == 'AP_WRAP': + # This is the default, so we won't write the full definition for brevity + args[2] = args[3] = None + + args = ','.join([str(arg) for arg in args if arg is not None]) + typestring = 'hls::ap_{signed}fixpt<{args}>'.format(signed='u' if not self.signed else '', args=args) + return typestring + + +class LAPTypeConverter(FixedPrecisionConverter): + def __init__(self): + super().__init__( + type_map={ + FixedPrecisionType: LAPFixedPrecisionDefinition, + IntegerPrecisionType: LAPIntegerPrecisionDefinition, + ExponentPrecisionType: LAPIntegerPrecisionDefinition, + XnorPrecisionType: LAPIntegerPrecisionDefinition, + }, + prefix='LAP', + ) + + +# endregion diff --git a/hls4ml/backends/libero/passes/core_templates.py b/hls4ml/backends/libero/passes/core_templates.py new file mode 100644 index 0000000000..c3ac18d17c --- /dev/null +++ b/hls4ml/backends/libero/passes/core_templates.py @@ -0,0 +1,152 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense + +# Dense templates + +dense_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +dense_function_template = 'nnet::{dense_function}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + + +class DenseConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_config_template + + def format(self, node): + params = self._default_config_params(node) + params['nzeros'] = node.get_weights('weight').nzeros + params['nonzeros'] = node.get_weights('weight').nonzeros + params['product_type'] = get_backend('libero').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + + return self.template.format(**params) + + +class DenseFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Dense, include_header=dense_include_list) + self.template = 
dense_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + if node.get_attr('strategy').lower() == 'latency': + params['dense_function'] = 'dense_latency' + elif node.get_attr('strategy').lower() == 'resource': + if int(params['reuse_factor']) <= int(params['n_in']): + params['dense_function'] = 'dense_resource_rf_leq_nin' + else: + params['dense_function'] = 'dense_resource_rf_gt_nin_rem0' + # The 3rd case is never used + + return self.template.format(**params) + + +# BatchNormalization templates + +batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); + static const bool store_weights_in_bram = false; + typedef {bias_t.name} bias_t; + typedef {scale_t.name} scale_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});' + +batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('libero').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' + +activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Activation) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = 
node.get_attr('activation').lower() + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) diff --git a/hls4ml/backends/libero/passes/pipeline_style.py b/hls4ml/backends/libero/passes/pipeline_style.py new file mode 100644 index 0000000000..4d4ce38f55 --- /dev/null +++ b/hls4ml/backends/libero/passes/pipeline_style.py @@ -0,0 +1,101 @@ +from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.optimizer import ModelOptimizerPass + + +class SetPipelineStyle(ModelOptimizerPass): + def __init__(self): + pass + + def transform(self, model): + if model.config.pipeline_style not in ['auto', 'pipeline', 'dataflow']: + print( + f'WARNING: Pipeline style set to {model.config.pipeline_style}, valid values: auto, pipeline, dataflow. ' + 'Using "auto".' + ) + self._set_pipeline_style(model, 'auto') + + if model.config.pipeline_style is None or model.config.pipeline_style == 'auto': + + if self._maybe_set_dataflow_io_stream(model): + return True + + if self._maybe_set_dataflow_conv_layers(model): + return True + + if self._maybe_set_dataflow_resource_strategy(model): + return True + + if self._maybe_set_pipeline_io_parallel(model): + return True + + self._set_safe_default_dataflow(model) + return True + else: + self._validate_hls_config(model) + + return False # No model changes made + + def _set_pipeline_style(self, model, pipeline_style): + # Could add logging here + model.config.pipeline_style = pipeline_style + + def _maybe_set_dataflow_io_stream(self, model): + if model.config.get_config_value('IOType') == 'io_stream': + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_dataflow_conv_layers(self, model): + for layer in model.get_layers(): + if isinstance(layer, (Conv1D, Conv2D)): + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_dataflow_resource_strategy(self, model): + for layer in model.get_layers(): + if model.config.is_resource_strategy(layer): + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_pipeline_io_parallel(self, model): + if model.config.get_config_value('IOType') == 'io_parallel': + self._set_pipeline_style(model, 'pipeline') + return True + + return False + + def _set_safe_default_dataflow(self, model): + print( + 'WARNING: Couldn\'t determine best pipeline style, defaulting to "DATAFLOW". ' + 'Use "PipelineStyle" property to override.' + ) + self._set_pipeline_style(model, 'dataflow') + + def _validate_hls_config(self, model): + if model.config.pipeline_style.lower() == 'pipeline': + if model.config.model_strategy.lower() == 'resource': + print( + 'WARNING: Model strategy "Resource" will lead to bad QoR in combination ' + 'with pipeline style set to "pipeline".' + ) + if any(isinstance(layer, (Conv1D, Conv2D)) for layer in model.get_layers()): + print('WARNING: Convolution layers require "dataflow" pipeline style.') + for layer_type, strategy in model.config.layer_type_strategy.items(): + if strategy.lower() == 'resource' and model.config.pipeline_style.lower() == 'pipeline': + print( + f'WARNING: Strategy for layer type {layer_type} set to "Resource", while pipeline style set to ' + '"pipeline". This will lead to bad QoR.' 
+ ) + + for layer_name, strategy in model.config.layer_name_strategy.items(): + if strategy.lower() == 'resource' and model.config.pipeline_style.lower() == 'pipeline': + print( + 'WARNING: Strategy for layer {} set to "Resource", while pipeline style set to "pipeline".'.format( + layer_name + ) + ) diff --git a/hls4ml/backends/libero/passes/transform_types.py b/hls4ml/backends/libero/passes/transform_types.py new file mode 100644 index 0000000000..7271e24c73 --- /dev/null +++ b/hls4ml/backends/libero/passes/transform_types.py @@ -0,0 +1,47 @@ +from hls4ml.backends.fpga.fpga_types import HLSTypeConverter, StaticWeightVariableConverter +from hls4ml.backends.libero.libero_types import ( + LAPTypeConverter, + LiberoArrayVariableConverter, + LiberoInplaceArrayVariableConverter, + LiberoInplaceStreamVariableConverter, + LiberoStreamVariableConverter, +) +from hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + + +class TransformTypes(GlobalOptimizerPass): + def __init__(self): + self.type_converter = HLSTypeConverter(precision_converter=LAPTypeConverter()) + self.array_var_converter = LiberoArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = LiberoInplaceArrayVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = LiberoStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = LiberoInplaceStreamVariableConverter(type_converter=self.type_converter) + self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter) + + def transform(self, model, node): + io_type = node.model.config.get_config_value('IOType') + + for out_name, var in node.variables.items(): + if io_type == 'io_stream': + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var) + else: + new_var = self.stream_var_converter.convert(var) + elif io_type == 'io_parallel': + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_array_var_converter.convert(var, pragma='') + else: + new_var = self.array_var_converter.convert(var, pragma='partition') + else: + raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') + + node.set_attr(out_name, new_var) + + for w_name, weight in node.weights.items(): + new_weight = self.weight_var_converter.convert(weight) + node.set_attr(w_name, new_weight) + + for t_name, type in node.types.items(): + new_type = self.type_converter.convert(type) + node.set_attr(t_name, new_type) diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py index 3c9b7707b7..c24207eda0 100644 --- a/hls4ml/report/__init__.py +++ b/hls4ml/report/__init__.py @@ -1,6 +1,7 @@ from hls4ml.report.catapult_report import parse_catapult_report # noqa: F401 from hls4ml.report.catapult_report import qofr # noqa: F401 from hls4ml.report.catapult_report import read_catapult_report # noqa: F401 +from hls4ml.report.libero_report import parse_libero_report # noqa: F401 from hls4ml.report.quartus_report import parse_quartus_report # noqa: F401 from hls4ml.report.quartus_report import read_quartus_report # noqa: F401 from hls4ml.report.vivado_report import parse_vivado_report # noqa: F401 diff --git a/hls4ml/report/libero_report.py b/hls4ml/report/libero_report.py new file mode 100644 index 0000000000..4591d8f338 --- /dev/null +++ b/hls4ml/report/libero_report.py @@ -0,0 +1,117 @@ +from pathlib import Path + + +def parse_libero_report(out_dir): + """Reads and 
parses an FPGA synthesis report into a structured dictionary.""" + + out_path = Path(out_dir) + report_path = out_path / 'hls_output/reports/summary.results.rpt' + if not report_path.exists(): + print(f'Libero report file {str(report_path)} not found.') + return {} + + with open(report_path) as file: + report_lines = file.readlines() + + return { + 'Simulation Result': _parse_sim_data(_extract_section(report_lines, '====== 1. Simulation Result')), + 'Timing Result': _parse_timing_data(_extract_section(report_lines, '====== 2. Timing Result')), + 'Resource Usage': parse_utilization_data(_extract_section(report_lines, '====== 3. Resource Usage')), + } + + +def _extract_section(lines, section_header): + """Extracts table data from a given section in the report.""" + + section_data = [] + in_section = False + + for line in lines: + if section_header in line: + in_section = True + continue + + if in_section: + if line.startswith('======'): # Start of next section + break + section_data.append(line.strip()) + + return section_data + + +def _parse_sim_data(data): + """Parses 'Simulation Result' section.""" + if len(data) == 0: + return {'Error': 'Data missing for this section'} + + sim_dict = {} + + for line in data: + if line.startswith('N/A. Please run'): + return {'Error': line} + elif line.startswith('+') or line.startswith('| Top-Level Name'): + continue # Ignore table borders + elif '|' in line: + columns = [col.strip() for col in line.split('|')[2:-1]] + sim_dict.update( + { + 'Number of calls': columns[0], + 'Simulation time (cycles)': columns[1], + 'Call Latency (min/max/avg)': columns[2], + 'Call II (min/max/avg)': columns[3], + } + ) + elif 'SW/HW co-simulation' in line: + sim_dict['Status'] = line.split(':')[1].strip() + + return sim_dict + + +def _parse_timing_data(data): + """Parses 'Timing Result' section.""" + if len(data) == 0: + return {'Error': 'Data missing for this section'} + + timing_dict = {} + + for line in data: + if line.startswith('N/A. Please run'): + return {'Error': line} + elif line.startswith('+') or line.startswith('| Clock Domain'): + continue # Ignore table borders + elif '|' in line: + columns = [col.strip() for col in line.split('|')[2:-1]] + timing_dict.update( + { + 'Target Period': columns[0], + 'Target Fmax': columns[1], + 'Worst Slack': columns[2], + 'Period': columns[3], + 'Fmax': columns[4], + } + ) + + return timing_dict + + +def parse_utilization_data(data): + """Parses 'Resource Usage' section.""" + if len(data) == 0: + return {'Error': 'Data missing for this section'} + + util_dict = {} + + for line in data: + if line.startswith('N/A. 
Please run'): + return {'Error': line} + elif line.startswith('+') or line.startswith('| Resource Type'): + continue # Ignore table borders + elif '|' in line: + columns = [col.strip() for col in line.split('|')[1:-1]] + util_dict[columns[0]] = { + 'Used': columns[1], + 'Total': columns[2], + 'Percentage': columns[3], + } + + return util_dict diff --git a/hls4ml/templates/libero/build_lib.sh b/hls4ml/templates/libero/build_lib.sh new file mode 100644 index 0000000000..a0860a462e --- /dev/null +++ b/hls4ml/templates/libero/build_lib.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC" +fi +LDFLAGS= +INCFLAGS="-I/opt/microchip/Libero_SoC_v2024.2/SmartHLS-2024.2/SmartHLS/smarthls-library/hls" +PROJECT=myproject +LIB_STAMP=mystamp +BASEDIR="$(cd "$(dirname "$0")" && pwd)" +WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\"" + +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls4ml/templates/libero/firmware/defines.h b/hls4ml/templates/libero/firmware/defines.h new file mode 100644 index 0000000000..fcdf1e080d --- /dev/null +++ b/hls4ml/templates/libero/firmware/defines.h @@ -0,0 +1,19 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include "nnet_utils/nnet_types.h" +#include +#include +#include +#include +#include + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert namespace-start + +// hls-fpga-machine-learning insert layer-precision + +// hls-fpga-machine-learning insert namespace-end + +#endif diff --git a/hls4ml/templates/libero/firmware/myproject.cpp b/hls4ml/templates/libero/firmware/myproject.cpp new file mode 100644 index 0000000000..c231adb47a --- /dev/null +++ b/hls4ml/templates/libero/firmware/myproject.cpp @@ -0,0 +1,19 @@ +#include + +#include "myproject.h" +#include "parameters.h" + +// hls-fpga-machine-learning insert namespace-start + +void myproject( + // hls-fpga-machine-learning insert header +) { + #pragma HLS function top + // hls-fpga-machine-learning insert IO + + // hls-fpga-machine-learning insert load weights + + // hls-fpga-machine-learning insert layers +} + +// hls-fpga-machine-learning insert namespace-end diff --git a/hls4ml/templates/libero/firmware/myproject.h b/hls4ml/templates/libero/firmware/myproject.h new file mode 100644 index 0000000000..0412ddc2d8 --- /dev/null +++ b/hls4ml/templates/libero/firmware/myproject.h @@ -0,0 +1,19 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include +#include +#include + +#include "defines.h" + +// hls-fpga-machine-learning insert namespace-start + +// Prototype of top level function for C-synthesis +void myproject( + // hls-fpga-machine-learning insert header +); + +// hls-fpga-machine-learning insert namespace-end + +#endif diff --git a/hls4ml/templates/libero/firmware/parameters.h b/hls4ml/templates/libero/firmware/parameters.h new file mode 100644 index 0000000000..feee517633 --- /dev/null +++ b/hls4ml/templates/libero/firmware/parameters.h @@ -0,0 +1,19 @@ +#ifndef PARAMETERS_H_ +#define PARAMETERS_H_ + +#include +#include + +#include "nnet_utils/nnet_code_gen.h" +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes + +// 
hls-fpga-machine-learning insert weights + +// hls-fpga-machine-learning insert namespace-start + +// hls-fpga-machine-learning insert layer-config + +// hls-fpga-machine-learning insert namespace-end + +#endif diff --git a/hls4ml/templates/libero/myproject_bridge.cpp b/hls4ml/templates/libero/myproject_bridge.cpp new file mode 100644 index 0000000000..b1822a5ff6 --- /dev/null +++ b/hls4ml/templates/libero/myproject_bridge.cpp @@ -0,0 +1,69 @@ +#ifndef MYPROJECT_BRIDGE_H_ +#define MYPROJECT_BRIDGE_H_ + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_helpers.h" +#include +#include + +// hls-fpga-machine-learning insert bram + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + // hls-fpga-machine-learning insert trace_outputs +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void myproject_float( + // hls-fpga-machine-learning insert header #float +) { + // hls-fpga-machine-learning insert namespace + + // hls-fpga-machine-learning insert wrapper #float +} + +void myproject_double( + // hls-fpga-machine-learning insert header #double +) { + // hls-fpga-machine-learning insert namespace + + // hls-fpga-machine-learning insert wrapper #double +} +} + +#endif diff --git a/hls4ml/templates/libero/myproject_test.cpp b/hls4ml/templates/libero/myproject_test.cpp new file mode 100644 index 0000000000..2dc963906e --- /dev/null +++ b/hls4ml/templates/libero/myproject_test.cpp @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_helpers.h" + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +namespace nnet { +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +int main(int argc, char **argv) { + // hls-fpga-machine-learning insert namespace + + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + +#ifdef RTL_SIM + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; +#else + std::string RESULTS_LOG = "tb_data/csim_results.log"; +#endif + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + int e = 0; + + if (fin.is_open() && fpr.is_open()) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != 
NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + + // hls-fpga-machine-learning insert data + + // hls-fpga-machine-learning insert top-level-function + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + e++; + + // hls-fpga-machine-learning insert tb-output + } + fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." << std::endl; + const unsigned NUM_TEST_SAMPLES = 5; + for (unsigned i = 0; i < NUM_TEST_SAMPLES; i++) { + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/libero/nnet_utils/nnet_activation.h b/hls4ml/templates/libero/nnet_utils/nnet_activation.h new file mode 100644 index 0000000000..014cd9570d --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_activation.h @@ -0,0 +1,800 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "nnet_common.h" +#include +#include +#include + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef hls::ap_fixpt<18, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS loop pipeline + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = data[ii]; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } + +template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default logistic sigmoid function: + // 
result = 1/(1+e^(-x)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + + // Index into the lookup table based on data + int data_round; + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)sigmoid_table[index]; + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; + +inline float exp_fcn_float(float input) { return std::exp(input); } + +template inline float softmax_real_val_from_idx(unsigned i) { + // Treat the index as the top N bits + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + data_T x(0); + x(x.width - 1, x.width - N) = i; + return (float)x; +} + +template inline unsigned softmax_idx_from_real_val(data_T x) { + // Slice the top N bits to get an index into the table + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + hls::ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input + return (unsigned)y(N - 1, 0); +} + +template +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + // Slicing bits for address is going to round towards 0, so take the central value + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); + table_out[i] = exp_x; + } +} + +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::inv_table_t inv_x = 1 / x; + table_out[i] = inv_x; + } +} + +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS function pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if 
(!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Calculate all the e^x's + #pragma HLS memory partition variable(exp_res) type(complete) + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + typename CONFIG_T::exp_table_t exp_sum(0); + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + unsigned x = softmax_idx_from_real_val(data[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS function pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + hls::ap_fixpt d_xi_xmax[CONFIG_T::n_in]; + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + #pragma HLS memory partition variable(exp_res) type(complete) + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + typename CONFIG_T::exp_table_t exp_sum(0); + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
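+    // Descriptive note (added): the reduce<> helper invoked below performs a balanced binary-tree
+    // summation of exp_res (depth roughly log2(n_in)) rather than a sequential accumulation,
+    // which keeps the critical path of the exponential sum short.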
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = exp_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Inversion function: + // result = 1/x + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range 0 to +64) + float in_val = 64.0 * ii / float(N_TABLE); + // Next, compute lookup table function + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; + } +} + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision + data_T data_cache[CONFIG_T::n_in]; + int data_round; + uint index; + #pragma HLS loop pipeline + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + data_cache[ii] = data[ii]; + exp_res[ii] = 0; + } + + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + for (uint jj = 0; jj < CONFIG_T::n_in; jj++) { + if (ii == jj) + exp_diff_res = 1; + else { + data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + exp_res[ii] += exp_diff_res; + } + } + + // Second loop to invert + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + uint exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; + res[ii] = (res_T)invert_table[exp_res_index]; + } +} + +template +void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS loop unroll + for (uint i = 0; i < CONFIG_T::n_in; i++) { + res[i] = (res_T)0; + } + + data_T maximum = data[0]; + int idx = 0; + + #pragma HLS loop pipeline + for (uint i = 1; i < CONFIG_T::n_in; i++) { + if 
(data[i] > maximum) { + maximum = data[i]; + idx = i; + } + } + + res[idx] = (res_T)1; +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS function inline + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Implement tanh lookup + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -4 to +4) + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = std::tanh(in_val); + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; + table_out[ii] = real_val; + } +} + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + + // Index into the lookup table based on data + int data_round; + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 8; + index = data_round + 4 * CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; + } +} + +// ************************************************* +// UnaryLUT Activation +// ************************************************* +template inline unsigned get_index_unary_lut(data_T x) { + // Slice the top N bits to get an index into the table + static constexpr int N = ceillog2(table_size); + return (unsigned)(x(x.width - 1, 0)); +} + +template +void unary_lut(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + typename CONFIG_T::table_t table[CONFIG_T::table_size]) { + #pragma HLS memory partition argument(table) + //#pragma HLS function_instantiate variable=table + + #pragma HLS loop unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + unsigned index = get_index_unary_lut(data[ii]); + res[ii] = (res_T)table[index]; + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS loop pipeline + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + if (CONFIG_T::io_type == io_parallel) { + #pragma HLS loop 
pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } + } else { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], param_T alpha, res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], param_T theta, res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } + +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softplus function: + // result = log(exp(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + + // Index into the lookup table based on data + int data_round; + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +inline float softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } + +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softsign function: + // result = x / (abs(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / 
float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + + // Index into the lookup table based on data + int data_round; + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } + +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default ELU function: + // result = alpha * (e^(x) - 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void elu(data_T data[CONFIG_T::n_in], const param_T alpha, res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + + data_T datareg; + // Index into the lookup table based on data + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu, res_T, CONFIG_T>(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* +inline float selu_fcn_float(float input) { + return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); +} + +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default SELU function: + // result = 1.05 * (1.673 * (e^(x) - 1)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); + // std::cout << 
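// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The sigmoid/softplus/softsign lookup tables above all map an input covering
// [-8, 8) to a table slot with the same arithmetic, idx = x*N/16 + 8*N/16,
// saturated to [0, N-1].  A stand-alone model, assuming a hypothetical table
// size N_LUT and float inputs:
constexpr int N_LUT = 1024; // stands in for CONFIG_T::table_size
inline int lut_index_model(float x) {
    int data_round = static_cast<int>(x * N_LUT / 16); // scale [-8, 8) to [-N/2, N/2)
    int index = data_round + 8 * N_LUT / 16;           // shift to [0, N)
    if (index < 0)
        index = 0;                                     // saturate inputs below -8
    if (index > N_LUT - 1)
        index = N_LUT - 1;                             // saturate inputs at or above +8
    return index;
}
// lut_index_model(-8.0f) == 0, lut_index_model(0.0f) == N_LUT / 2,
// lut_index_model(7.99f) == N_LUT - 1.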
"Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + + data_T datareg; + // Index into the lookup table based on data + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], param_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + res_T cache; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + res_T cache; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/libero/nnet_utils/nnet_activation_stream.h new file mode 100644 index 0000000000..7ee7ffa516 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,800 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_stream.h" +#include "nnet_types.h" +#include +#include +#include +#include + +namespace nnet { + +// ************************************************* +// LINEAR Activation +// ************************************************* +template void linear(hls::FIFO &data, hls::FIFO &res) { +LinearActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + ////PRAGMA_DATA_PACK(out_data) + + LinearPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = in_data[j]; + } + + res.write(out_data); + } +} + +// ************************************************* +// RELU Activation +// 
************************************************* +template void relu(hls::FIFO &data, hls::FIFO &res) { +ReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + ////PRAGMA_DATA_PACK(out_data) + + ReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* + +template void sigmoid(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + +SigmoidActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + ////PRAGMA_DATA_PACK(out_data) + + SigmoidPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = sigmoid_table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +template +void softmax_latency(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + + // Calculate all the e^x's + #pragma HLS memory partition variable(exp_res) type(complete) + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + typename CONFIG_T::exp_table_t exp_sum(0); +SoftmaxExpLoop: + #pragma HLS loop pipeline II(ii) + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + + data_T in_pack = data.read(); + SoftmaxExpPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + unsigned x = softmax_idx_from_real_val(in_pack[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
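// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The reduce<...> call that follows sums the exponentials with a balanced
// binary tree (defined later in nnet_common.h) so the adder depth grows as
// log2(N) instead of a serial chain.  A simplified recursive model that
// splits the array in half (the HLS version splits at the largest power of
// two below N):
template <typename T, int N, typename Op> T reduce_model(const T *x, Op op) {
    if constexpr (N == 1) {
        return x[0];
    } else {
        constexpr int left = N / 2;
        return op(reduce_model<T, left>(x, op), reduce_model<T, N - left>(x + left, op));
    }
}
// Usage: float s = reduce_model<float, 8>(vals, [](float a, float b) { return a + b; });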
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + // PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template void softmax_stable(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + + #pragma HLS memory partition variable(data_array) type(complete) + typename data_T::value_type data_array[data_T::size]; +SoftmaxArrayLoop: + #pragma HLS loop pipeline II(ii) + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + + data_T in_pack = data.read(); + SoftmaxArrayPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + typename data_T::value_type x_max = + reduce>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + hls::ap_fixpt d_xi_xmax[data_T::size]; + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + #pragma HLS memory partition variable(exp_res) type(complete) + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + typename CONFIG_T::exp_table_t exp_sum(0); + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
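// --- Illustrative sketch (editorial, not part of the patch) ---------------
// multiplier_limit and ii above bound how many multipliers one packed word
// may use per cycle: DIV_ROUNDUP(n, d) is ceil(n/d), and the pack is then
// spread over ii cycles.  A worked example with illustrative numbers
// (pack_size and reuse below are assumptions for this sketch only):
constexpr unsigned div_roundup_model(unsigned n, unsigned d) { return (n + d - 1) / d; }
constexpr unsigned pack_size = 10;                                   // data_T::size
constexpr unsigned reuse = 4;                                        // CONFIG_T::reuse_factor
constexpr unsigned mult_limit = div_roundup_model(pack_size, reuse); // ceil(10/4) = 3 multipliers
constexpr unsigned ii_model = pack_size / mult_limit;                // 10/3 = 3 cycles per pack
static_assert(mult_limit == 3 && ii_model == 3, "worked example");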
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + // PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res.write(out_pack); + } +} + +template void softmax_legacy(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[data_T::size]; + typename CONFIG_T::table_t exp_diff_res; + typename data_T::value_type data_cache[data_T::size]; + +SoftmaxInitLoop: + #pragma HLS loop pipeline + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + data_T in_pack = data.read(); + SoftmaxInitPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + #pragma HLS loop unroll + for (int i = 0; i < data_T::size; i++) { + SoftmaxExpInner: + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = (data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + + exp_res[i] += exp_diff_res; + } + } + + res_T out_pack; + // PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < res_T::size; j++) { + + int exp_res_index = exp_res[j] * CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + + out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index]; + } + res.write(out_pack); + } +} + +template void softmax_argmax(hls::FIFO &data, hls::FIFO &res) { + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + + #pragma HLS loop unroll + for (int i = 0; i < res_T::size; i++) { + out_data[i] = (typename res_T::value_type)0; + } + + typename data_T::value_type maximum = in_data[0]; + int idx = 0; + + #pragma HLS loop pipeline + for (int i = 1; i < res_T::size; i++) { + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = (typename res_T::value_type)1; + res.write(out_data); + } +} + +template void softmax(hls::FIFO &data, hls::FIFO &res) { + assert(CONFIG_T::axis == -1); + + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + 
softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* + +template void tanh(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + +TanHActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + ////PRAGMA_DATA_PACK(out_data) + + TanHPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + int data_round = in_data[j] * CONFIG_T::table_size / 8; + int index = data_round + 4 * CONFIG_T::table_size / 8; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = tanh_table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// UnaryLUT Activation +// ************************************************* + +template +void unary_lut(hls::FIFO &data, hls::FIFO &res, typename CONFIG_T::table_t table[CONFIG_T::table_size]) { + //#pragma HLS function_instantiate variable=table + #pragma HLS memory partition argument(table) type(complete) + +UnaryLUTActLoop: + #pragma HLS loop pipeline II(CONFIG_T::reuse_factor)// rewind + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + UnaryLUTPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + unsigned index = get_index_unary_lut(in_data[j].V); + out_data[j] = table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* + +template void hard_sigmoid(hls::FIFO &data, hls::FIFO &res) { + +HardSigmoidActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + HardSigmoidPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res.write(out_data); + } +} + +template void hard_tanh(hls::FIFO &data, hls::FIFO &res) { + +HardSigmoidActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + HardSigmoidPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* + +template +void leaky_relu(hls::FIFO &data, param_T alpha, hls::FIFO 
&res) { +LeakyReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + LeakyReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* + +template +void thresholded_relu(hls::FIFO &data, param_T theta, hls::FIFO &res) { +ThresholdedReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + ThresholdedReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* + +template void softplus(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + +SoftplusActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + SoftplusPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + res.write(out_data); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* + +template void softsign(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + +SoftsignActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + SoftsignPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softsign_table[index]; + } + res.write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(hls::FIFO &data, param_T alpha, hls::FIFO &res) { + // Initialize the lookup 
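// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The ELU/SELU tables cover only the negative half-axis [-8, 0): a negative
// input x maps to idx = x * N / -8, saturated at N-1, while non-negative
// inputs bypass the table.  Stand-alone model with a hypothetical table size:
constexpr int N_ELU_LUT = 1024;                       // stands in for CONFIG_T::table_size
inline float elu_model(float x, float alpha, const float table[N_ELU_LUT]) {
    if (x >= 0)
        return x;                                     // identity on the non-negative side
    int index = static_cast<int>(x * N_ELU_LUT / -8); // map [-8, 0) to [0, N)
    if (index > N_ELU_LUT - 1)
        index = N_ELU_LUT - 1;                        // saturate for x <= -8
    return alpha * table[index];                      // table entries hold exp(x) - 1
}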
table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + +EluActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + EluPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + res.write(out_data); + } +} + +template void elu(hls::FIFO &data, hls::FIFO &res) { + elu, res_T, CONFIG_T>(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* + +template void selu(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + +SeluActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + SeluPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = (typename data_T::value_type)1.0507009873554804934193349852946 * datareg; + } else { + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + res.write(out_data); + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* + +template +void prelu(hls::FIFO &data, const param_T alpha[CONFIG_T::n_in], hls::FIFO &res) { +PReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh(hls::FIFO &data, hls::FIFO &res) { +PReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; + } + res.write(out_data); + } +} + +// ************************************************* +// Ternary TanH Activation +// 
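// --- Illustrative sketch (editorial, not part of the patch) ---------------
// binary_tanh above and ternary_tanh below are quantizers rather than a true
// tanh: binary keeps only the sign, ternary adds a dead zone around zero
// (thresholds at |x| = 0.5 in the io_parallel version earlier in this patch).
inline int binary_tanh_model(float x) { return x > 0 ? 1 : -1; }
inline int ternary_tanh_model(float x) {
    float d = 2 * x;
    if (d > 1)
        return 1;                                     // x > 0.5
    if (d <= -1)
        return -1;                                    // x <= -0.5
    return 0;                                         // dead zone in between
}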
************************************************* +template void ternary_tanh(hls::FIFO &data, hls::FIFO &res) { +PReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; + } + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_code_gen.h b/hls4ml/templates/libero/nnet_utils/nnet_code_gen.h new file mode 100644 index 0000000000..30953da7bf --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_code_gen.h @@ -0,0 +1,28 @@ +#ifndef NNET_INSTR_GEN_H_ +#define NNET_INSTR_GEN_H_ + +#include "nnet_conv1d_latency.h" +#include "nnet_helpers.h" + +#include "nnet_common.h" +#include "nnet_function_stubs.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template class PointwiseConv1D { + public: + static void pointwise_conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + +// hls4ml insert code + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_common.h b/hls4ml/templates/libero/nnet_utils/nnet_common.h new file mode 100644 index 0000000000..64e3482ea7 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_common.h @@ -0,0 +1,65 @@ +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "nnet_helpers.h" +#include + +// This is a substitute for "ceil(n/(float)d)". +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n > d ? n : d) + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; +enum strategy { latency, resource }; + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Vivado cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if (N == 1) { + return x[0]; + } + if (N == 2) { + return op(x[0], x[1]); + } + return op(reduce(x, op), reduce(x + leftN, op)); +} + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_and { + public: + T operator()(T a, T b) { return a && b; } +}; + +template class Op_or { + public: + T operator()(T a, T b) { return a || b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? a : b; } +}; + +template class Op_min { + public: + T operator()(T a, T b) { return a <= b ? 
a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/libero/nnet_utils/nnet_conv1d_latency.h new file mode 100644 index 0000000000..6caa6166f3 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_conv1d_latency.h @@ -0,0 +1,167 @@ +#ifndef NNET_CONV1D_LATENCY_H_ +#define NNET_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + //#pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + + //#pragma HLS ARRAY_PARTITION variable=weights complete + //#pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + #pragma HLS loop pipeline II(CONFIG_T::reuse_factor)// rewind + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + #pragma HLS loop unroll + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + + data_T cache; + + // Do the matrix-multiply + Product1: + #pragma HLS loop unroll + for (int i_in = 0; i_in < mult_n_in; i_in++) { + cache = data_buf[i_pxl][i_in]; + Product2: + #pragma HLS loop unroll + for (int i_out = 0; i_out < mult_n_out; i_out++) { + mult[i_in * mult_n_out + i_out] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in * mult_n_out + i_out]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + #pragma HLS loop unroll + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + #pragma HLS loop unroll + for (int i_in = 0; i_in < mult_n_in; i_in++) { + Accum2: + #pragma HLS loop unroll + for (int i_out = 0; i_out < mult_n_out; i_out++) { + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + #pragma HLS loop unroll + for (int i_res = 0; i_res < mult_n_out; i_res++) { + res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] = + cast(acc[i_res]); + } + } + } +} + +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION 
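// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The pointwise (1x1) convolution here reads input position ii*stride -
// pad_left for output position ii; positions that fall in the left/right
// padding contribute zero instead of a product.  A scalar model of one tap
// (the sizes below are illustrative assumptions):
constexpr int PW_STRIDE = 1, PW_PAD_LEFT = 0, PW_IN_WIDTH = 8, PW_N_CHAN = 3;
inline float pointwise_tap_model(const float *data, const float *weights, int ii, int cc, int ff, int n_filt) {
    int in_pos = ii * PW_STRIDE - PW_PAD_LEFT;
    if (in_pos < 0 || in_pos >= PW_IN_WIDTH)
        return 0.0f;                                                      // padding region
    return data[in_pos * PW_N_CHAN + cc] * weights[cc * n_filt + ff];     // one multiply of the 1x1 kernel
}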
variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + #pragma HLS loop pipeline II(CONFIG_T::reuse_factor) + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + #pragma HLS loop unroll + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = CONFIG_T::mult_config::template product::product( + data[index_data], weights[index_weight]); + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + #pragma HLS loop unroll + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + #pragma HLS loop unroll + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = cast(acc[ii][ff]); + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense.h b/hls4ml/templates/libero/nnet_utils/nnet_dense.h new file mode 100644 index 0000000000..0548dab794 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense.h @@ -0,0 +1,82 @@ +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_function_stubs.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + + template using kernel = 
nnet::DenseKernel; + + // Partitioning arrays cyclically to go with roll factors? + + // Product function to use + template using product = nnet::product::mult; +}; + +template +void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + CONFIG_T::template kernel::dense(data, res, weights, biases); +} + +template class DenseLatency : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + dense_latency(data, res, weights, biases); + } +}; + +template +class DenseResource_rf_leq_nin : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + dense_resource_rf_leq_nin(data, res, weights, biases); + } +}; + +template +class DenseResource_rf_gt_nin_rem0 : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/libero/nnet_utils/nnet_dense_compressed.h new file mode 100644 index 0000000000..de8743d712 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense_compressed.h @@ -0,0 +1,89 @@ +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + #pragma HLS loop unroll + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS memory partition argument(biases) type(complete) + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + + //#ifdef __VITIS_HLS__ + // #pragma HLS AGGREGATE variable=weights + //#else + // #pragma HLS data_pack variable=weights struct_level + //#endif + +InitAccum: + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + + #pragma HLS memory partition variable(mult) type(complete) + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + + ResetMult: + #pragma HLS loop unroll + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + 
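// --- Illustrative sketch (editorial, not part of the patch) ---------------
// dense() above dispatches through the `kernel` template alias in the layer
// config, so a generated config can point a layer at DenseLatency or one of
// the DenseResource_* kernels at compile time.  A minimal stand-alone
// analogue of that dispatch (names here are hypothetical, not the hls4ml API):
template <class data_T, class res_T> struct LatencyKernelModel {
    static void run(const data_T *in, res_T *out) { out[0] = res_T(in[0]); } // placeholder body
};
struct layer_config_model {
    template <class d, class r> using kernel = LatencyKernelModel<d, r>;     // chosen when the config is generated
};
template <class d, class r, class CFG> void layer_model(const d *in, r *out) {
    CFG::template kernel<d, r>::run(in, out);                                // compile-time dispatch
}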
mult[imult] = 0; + } + + CompressedMultLoop: + #pragma HLS loop unroll + for (unsigned im = 0; im < multiplier_limit; im++) { + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/libero/nnet_utils/nnet_dense_latency.h new file mode 100644 index 0000000000..a241857450 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense_latency.h @@ -0,0 +1,72 @@ +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function pipeline II(CONFIG_T::reuse_factor) + #pragma HLS memory partition argument(biases) type(complete) + data_T cache; + #pragma HLS memory partition variable(mult) type(complete) + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Do the matrix-multiply +Product1: + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (uint jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = CONFIG_T::template product::product(cache, weights[index]); + } + } + +// Initialize accumulator with input biases +ResetAccum: + for (uint iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +// Accumulate multiplication result +Accum1: + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + Accum2: + for (uint jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + acc[jj] += mult[index]; + } + } + +// Cast to "res_t" type +Result: + for (uint ires = 0; ires < CONFIG_T::n_out; ires++) { + // res[ires] = (res_T) (acc[ires]); + res[ires] = cast(acc[ires]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/libero/nnet_utils/nnet_dense_resource.h new file mode 100644 index 0000000000..2c21e2f845 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense_resource.h @@ -0,0 +1,270 @@ +#ifndef NNET_DENSE_RESOURCE_H_ 
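// --- Illustrative sketch (editorial, not part of the patch) ---------------
// dense_latency above is a fully unrolled matrix-vector product: every
// product gets its own mult[] slot, biases seed the accumulators, and the
// loops collapse under the unroll/pipeline pragmas.  A plain C++ reference
// model of the same arithmetic:
template <int N_IN, int N_OUT>
void dense_latency_model(const float data[N_IN], float res[N_OUT],
                         const float weights[N_IN * N_OUT], const float biases[N_OUT]) {
    float acc[N_OUT];
    for (int j = 0; j < N_OUT; j++)
        acc[j] = biases[j];                             // seed accumulators with the biases
    for (int i = 0; i < N_IN; i++)
        for (int j = 0; j < N_OUT; j++)
            acc[j] += data[i] * weights[i * N_OUT + j]; // row-major weight layout, as above
    for (int j = 0; j < N_OUT; j++)
        res[j] = acc[j];                                // the HLS version casts to res_T here
}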
+#define NNET_DENSE_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS memory partition argument(biases) type(complete) + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + //#pragma HLS function_instantiate variable=weights,biases + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + + // if (CONFIG_T::reuse_factor > 1) { + // #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + // } + + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma HLS loop unroll + for (int iacc = 0; iacc < nout; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + #pragma HLS loop pipeline II(1)// rewind + for (int ir = 0; ir < rufactor; ir++) { + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + #pragma HLS loop unroll + for (int im = 0; im < block_factor; im++) { + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + +// Cast to "res_t" type +Result: + #pragma HLS loop unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS memory partition argument(biases) type(complete) + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + //#pragma HLS function_instantiate variable=weights,biases + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + + // if (CONFIG_T::reuse_factor > 1) { + // 
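// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The asserts in these resource kernels encode which reuse factors are legal.
// For the RF <= n_in kernel above, the binding constraint is that
// ceil(n_in*n_out / RF) splits evenly over the outputs; a simplified
// compile-time check (a model, not the hls4ml API):
constexpr bool rf_valid_leq_nin_model(int n_in, int n_out, int rf) {
    return rf <= n_in && ((n_in * n_out + rf - 1) / rf) % n_out == 0;
}
static_assert(rf_valid_leq_nin_model(16, 8, 4), "RF=4 works for a 16x8 dense layer");
static_assert(!rf_valid_leq_nin_model(16, 8, 5), "RF=5 does not: 26 multipliers cannot split over 8 outputs");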
#pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + // } + + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma HLS loop unroll + for (int iacc = 0; iacc < nout; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; +IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + +ReuseLoop: + #pragma HLS loop pipeline II(1)// rewind + for (int ir = 0; ir < rufactor; ir++) { + + w_index = ir; + out_index = outidx[ir] /*outstep*/; + + MultLoop: + #pragma HLS loop unroll + for (int im = 0; im < block_factor; im++) { + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. + } + } + +// Cast to "res_t" type +Result: + #pragma HLS loop unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS memory partition argument(biases) type(complete) + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + //#pragma HLS function_instantiate variable=weights,biases + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + + // if (CONFIG_T::reuse_factor > 1) { + // #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + // } + + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma HLS loop unroll + for (int iacc = 0; iacc < nout; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + #pragma HLS loop pipeline II(1)// rewind + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS memory partition variable(tmpmult) type(complete) + typename CONFIG_T::accum_t tmpmult[block_factor]; + + MultLoop: + #pragma HLS loop unroll + for (int im = 0; im < block_factor; im++) { + int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; // check out of bounds + tmpmult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + #pragma HLS memory partition variable(mult) type(complete) + typename CONFIG_T::accum_t mult[multiplier_limit]; + + ResetMult: + #pragma HLS loop unroll + for (int imult = 0; imult < 
multiplier_limit; imult++) { + mult[imult] = 0; + } + + AccumLoop1: + #pragma HLS loop unroll + for (int im = 0; im < block_factor; im++) { + int w_index = ir + rufactor * im; + int out_index = w_index / multfactor; + if (out_index >= multiplier_limit) + continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + #pragma HLS loop unroll + for (int im = 0; im < multiplier_limit; im++) { + // int out_index = im/multscale; // This is the general case + // acc[out_index] += mult[im]; + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + +// Cast to "res_t" type +Result: + #pragma HLS loop unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/libero/nnet_utils/nnet_dense_stream.h new file mode 100644 index 0000000000..194eb018ae --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,105 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" +#include +#include +#include + +namespace nnet { + +template +void dense_latency_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function pipeline II(CONFIG_T::reuse_factor) + dense_latency(data, res, weights, biases); +} + +template +void dense_resource_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + dense_resource(data, res, weights, biases); +} + +template +void data_prepare(hls::FIFO &data_stream, typename data_T::value_type data[CONFIG_T::n_in]) { + #pragma HLS function inline + + if (CONFIG_T::n_in / data_T::size > 1) { + DataPrepare: + #pragma HLS loop pipeline + for (int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + data_T data_pack = data_stream.read(); + DataPackPipeline: + #pragma HLS loop unroll + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + } else { + data_T data_pack = data_stream.read(); + DataPackSingle: + #pragma HLS loop unroll + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + data[i_pack] = data_pack[i_pack]; + } + } +} + +template +void res_write(typename res_T::value_type res[CONFIG_T::n_out], hls::FIFO &res_stream) { + #pragma HLS function inline + + if (CONFIG_T::n_out / res_T::size > 1) { + ResWrite: + #pragma HLS loop pipeline + for (unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + ResPackPipeline: + #pragma HLS loop unroll + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + 
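// --- Illustrative sketch (editorial, not part of the patch) ---------------
// dense_resource above picks one of the three kernels purely from the reuse
// factor; the rule can be written as a small compile-time selector (a model,
// not part of the patch):
enum class rf_kernel_model { rf_leq_nin, rf_gt_nin_rem0, rf_gt_nin };
constexpr rf_kernel_model pick_kernel_model(int reuse_factor, int n_in) {
    return reuse_factor <= n_in       ? rf_kernel_model::rf_leq_nin
           : reuse_factor % n_in == 0 ? rf_kernel_model::rf_gt_nin_rem0
                                      : rf_kernel_model::rf_gt_nin;
}
static_assert(pick_kernel_model(4, 16) == rf_kernel_model::rf_leq_nin, "RF below n_in");
static_assert(pick_kernel_model(32, 16) == rf_kernel_model::rf_gt_nin_rem0, "RF a multiple of n_in");
static_assert(pick_kernel_model(24, 16) == rf_kernel_model::rf_gt_nin, "general fallback");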
res_stream.write(res_pack); + } + } else { + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + ResPackSingle: + #pragma HLS loop unroll + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = res[i_pack]; + } + res_stream.write(res_pack); + } +} + +template +void dense(hls::FIFO &data_stream, hls::FIFO &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + + #pragma HLS memory partition variable(data) type(complete) + typename data_T::value_type data[CONFIG_T::n_in]; + + #pragma HLS memory partition variable(res) type(complete) + typename res_T::value_type res[CONFIG_T::n_out]; + + data_prepare(data_stream, data); + if (CONFIG_T::strategy == nnet::latency) { + dense_latency_wrapper(data, res, weights, biases); + } else { + dense_resource_wrapper(data, res, weights, + biases); + } + res_write(res, res_stream); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/libero/nnet_utils/nnet_function_stubs.h new file mode 100644 index 0000000000..c42b28a463 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_function_stubs.h @@ -0,0 +1,51 @@ +#ifndef NNET_FUNCTION_STUBS_H_ +#define NNET_FUNCTION_STUBS_H_ + +#include "nnet_helpers.h" + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + // To be implemented in subclasses + } +}; + +template class Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_helpers.h b/hls4ml/templates/libero/nnet_utils/nnet_helpers.h new file mode 100644 index 0000000000..a37b1243c7 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_helpers.h @@ -0,0 +1,279 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR "firmware/weights" +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + 
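// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The load_weights_from_txt helper expects each weights file to be a single
// line of comma-separated values.  A stand-alone equivalent using
// std::vector (the file path in the usage note is a hypothetical example):
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
inline std::vector<float> load_csv_line_model(const std::string &path) {
    std::ifstream in(path);
    std::vector<float> values;
    std::string line, token;
    if (std::getline(in, line)) {
        std::istringstream iss(line);
        while (std::getline(iss, token, ','))
            values.push_back(std::stof(token));     // one weight per comma-separated token
    }
    return values;                                  // caller checks values.size() against SIZE
}
// Usage: auto w = load_csv_line_model("firmware/weights/w2.txt");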
exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + w[i] = T(std::stof(token.c_str())); + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].sign >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data(srcType *src, hls::FIFO &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(hls::FIFO &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j]); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = save_T(data[i]); + } +} + +template void save_output_array(hls::FIFO &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << float(data[i]) << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(hls::FIFO &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << float(ctype[j]) << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, hls::FIFO &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (uint i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(hls::FIFO &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) + result.write(res_pack); + } + out << std::endl; +} + +template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } + +template void fill_zero(hls::FIFO &data) { + for (int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 
1 : 2 * pow2(x - 1); } + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_mult.h b/hls4ml/templates/libero/nnet_utils/nnet_mult.h new file mode 100644 index 0000000000..02418b4618 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_mult.h @@ -0,0 +1,118 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. + * --- */ + +class Product {}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + #pragma HLS function inline + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + #pragma HLS function inline + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + #pragma HLS function inline + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + #pragma HLS function inline + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + #pragma HLS function inline + return a * w; + } +}; + +template class weight_exponential : public Product { + public: + using r_T = hls::ap_fixpt<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + #pragma HLS function inline + + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? 
y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + hls::ap_int>::type +cast(typename CONFIG_T::accum_t x) { + return (hls::ap_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_stream.h b/hls4ml/templates/libero/nnet_utils/nnet_stream.h new file mode 100644 index 0000000000..d32fbb5f7f --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_stream.h @@ -0,0 +1,223 @@ +#ifndef NNET_STREAM_H +#define NNET_STREAM_H + +#include "nnet_common.h" +#include + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; +}; + +template +void clone_stream(hls::FIFO &data, hls::FIFO &res1, hls::FIFO &res2) { +CloneLoop: + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + // PRAGMA_DATA_PACK(out_data1) + // PRAGMA_DATA_PACK(out_data2) + + ClonePack: + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + } +} + +template +void clone_stream(hls::FIFO &data, hls::FIFO &res1, hls::FIFO &res2, hls::FIFO &res3) { +CloneLoop: + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + res_T out_data3; + // PRAGMA_DATA_PACK(out_data1) + // PRAGMA_DATA_PACK(out_data2) + // PRAGMA_DATA_PACK(out_data3) + + ClonePack: + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + out_data3[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + res3.write(out_data3); + } +} + +template void repack_stream(hls::FIFO &data, hls::FIFO &res) { + if (data_T::size == res_T::size) { + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + out_data[j] = in_data[j]; + } + + res.write(out_data); + } + } else if (data_T::size > res_T::size) { + constexpr unsigned pack_diff = data_T::size / res_T::size; + if (N / data_T::size > 1) { + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + #pragma HLS loop pipeline + for (int j = 0; j < pack_diff; j++) { + + res_T out_data; + #pragma HLS loop unroll + for (int k = 0; k < res_T::size; k++) { + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } else { + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + #pragma HLS loop pipeline + for (int j = 0; j < pack_diff; j++) { + + res_T out_data; + #pragma HLS loop unroll + for (int k = 0; k < 
res_T::size; k++) { + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } + } else { // data_T::size < res_T::size + res_T out_data; + constexpr unsigned pack_diff = res_T::size / data_T::size; + unsigned pack_cnt = 0; + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + out_data[pack_cnt * data_T::size + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res.write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +template +void broadcast_stream_1x1xC(hls::FIFO &data, hls::FIFO &res) { + assert(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan); + int n_dupl = (CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::out_chan) / + (CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan); +BroadcastLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + data_T in_data = data.read(); + #pragma HLS loop pipeline + for (int j = 0; j < n_dupl; j++) { + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + #pragma HLS loop unroll + for (int k = 0; k < res_T::size; k++) { + out_data[k] = in_data[k]; + } + res.write(out_data); + } + } +} + +template +void broadcast_stream_HxWx1(hls::FIFO &data, hls::FIFO &res) { + assert(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width); +BroadcastLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + #pragma HLS loop unroll + for (int k = 0; k < res_T::size; k++) { + out_data[k] = in_data[0]; + } + res.write(out_data); + } +} + +template +void broadcast_stream(hls::FIFO &data, hls::FIFO &res) { + if (CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { + broadcast_stream_1x1xC(data, res); + } else if (CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width) { + broadcast_stream_HxWx1(data, res); + } +} + +template void transpose_2d(hls::FIFO &data, hls::FIFO &res) { + #pragma HLS memory partition variable(data_array) type(complete) + typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; + + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + } + } + + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + for (int j = 0; j < res_T::size; j++) { + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + } + res.write(out_data); + } +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_types.h b/hls4ml/templates/libero/nnet_utils/nnet_types.h new file mode 100644 index 0000000000..16737a6630 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_types.h @@ -0,0 +1,66 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include +#include +#include + +namespace nnet { + +// Fixed-size array +template struct array { 
+ typedef T value_type; + static const unsigned size = N; + + T data[N]; + + T &operator[](size_t pos) { return data[pos]; } + + const T &operator[](size_t pos) const { return data[pos]; } + + array &operator=(const array &other) { + if (&other == this) + return *this; + + assert(N == other.size && "Array sizes must match."); + + #pragma HLS loop unroll + for (unsigned i = 0; i < N; i++) { + data[i] = other[i]; + } + return *this; + } +}; + +// Generic lookup-table implementation, for use in approximations of math functions +template class lookup_table { + public: + lookup_table(T from, T to) : range_start(from), range_end(to), base_div(hls::ap_uint<16>(N) / T(to - from)) { + T step = (range_end - range_start) / hls::ap_uint<16>(N); + for (size_t i = 0; i < N; i++) { + T num = range_start + hls::ap_uint<16>(i) * step; + T sample = func(num); + samples[i] = sample; + } + } + + T operator()(T n) const { + int index = (n - range_start) * base_div; + if (index < 0) + index = 0; + else if (index > N - 1) + index = N - 1; + return samples[index]; + } + + private: + T samples[N]; + const T range_start, range_end; + hls::ap_fixpt<20, 16> base_div; +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 8de19fe1d2..8c48f79d2d 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,4 +1,5 @@ from hls4ml.writer.catapult_writer import CatapultWriter +from hls4ml.writer.libero_writer import LiberoWriter from hls4ml.writer.oneapi_writer import OneAPIWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter @@ -13,4 +14,5 @@ register_writer('Quartus', QuartusWriter) register_writer('oneAPI', OneAPIWriter) register_writer('Catapult', CatapultWriter) +register_writer('Libero', LiberoWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/libero_writer.py b/hls4ml/writer/libero_writer.py new file mode 100644 index 0000000000..7e6dfd03e9 --- /dev/null +++ b/hls4ml/writer/libero_writer.py @@ -0,0 +1,844 @@ +import glob +import stat +import tarfile +from collections import OrderedDict +from pathlib import Path +from shutil import copyfile + +import numpy as np +import yaml + +from hls4ml.writer.writers import Writer + +config_filename = 'hls4ml_config.yml' + + +class LiberoWriter(Writer): + def print_array_to_cpp(self, var, odir, namespace=None, write_txt_file=True): + """Write a weights array to C++ header files. + + Args: + var (WeightVariable): Weight to write + odir (str): Output directory + namespace (str, optional): Writes a namespace for the weights to avoid clashes with global variables. + write_txt_file (bool, optional): Write txt files in addition to .h files. Defaults to True. 
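+
+        A minimal usage sketch (illustrative only; assumes an already-populated ModelGraph
+        'model' whose output directory contains a firmware/weights folder, and 'hls4ml_prj'
+        is a placeholder namespace):
+
+            writer = LiberoWriter()
+            for layer in model.get_layers():
+                for w in layer.get_weights():
+                    writer.print_array_to_cpp(w, model.config.get_output_dir(), namespace='hls4ml_prj')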
+ """ + + h_file = open(f'{odir}/firmware/weights/{var.name}.h', 'w') + if write_txt_file: + txt_file = open(f'{odir}/firmware/weights/{var.name}.txt', 'w') + + # meta data + h_file.write(f'//Numpy array shape {var.shape}\n') + h_file.write(f'//Min {np.min(var.min):.12f}\n') + h_file.write(f'//Max {np.max(var.max):.12f}\n') + h_file.write(f'//Number of zeros {var.nzeros}\n') + h_file.write('\n') + + h_file.write(f'#ifndef {var.name.upper()}_H_\n') + h_file.write(f'#define {var.name.upper()}_H_\n') + h_file.write('\n') + + if namespace is not None: + h_file.write(f'namespace {namespace} {{\n\n') + + if write_txt_file: + h_file.write('#ifndef __SYNTHESIS__\n') + h_file.write(var.definition_cpp() + ';\n') + h_file.write('#else\n') + + h_file.write(var.definition_cpp() + ' = {') + + # fill c++ array. + # not including internal brackets for multidimensional case + sep = '' + for x in var: + h_file.write(sep + x) + if write_txt_file: + txt_file.write(sep + x) + sep = ', ' + h_file.write('};\n\n') + + if write_txt_file: + h_file.write('#endif\n') + txt_file.close() + + if namespace is not None: + h_file.write('}\n\n') + + h_file.write('\n#endif\n') + h_file.close() + + def write_project_dir(self, model): + """Write the base project directory + + Args: + model (ModelGraph): the hls4ml model. + """ + out_path = Path(f'{model.config.get_output_dir()}/firmware/weights') + out_path.mkdir(parents=True, exist_ok=True) + + @staticmethod + def _make_array_pragma(variable, is_argument=False): + """ + Layers in ModelGraph can specify output array partitioning through the `pragma` attribute. + If `pragma` is a string: options are 'partition' or 'stream'. + If `pragma` is a tuple: (mode, type, factor) where mode is 'partition', type is + 'complete', 'cyclic', or 'block', and factor is an integer only used when the type is not 'complete'. + """ + + config = variable.pragma + if type(config) is tuple: + mode = config[0] + if mode == 'partition': + typ = config[1] + if typ != 'complete': + factor = config[2] + elif mode == 'stream': + depth = config[1] + else: + mode = config + typ = 'complete' + factor = 0 + + arg_name = 'argument' if is_argument else 'variable' + + if mode == 'partition': + if typ == 'complete': + template = '#pragma HLS memory partition {arg_name}({name}) type({type}) dim({dim})' + else: + template = '#pragma HLS memory partition {arg_name}({name}) type({type}) factor({factor}) dim({dim})' + + return template.format(mode=mode.upper(), name=variable.name, type=typ, factor=factor, dim=0, arg_name=arg_name) + + elif mode == 'stream': + # TODO update for streaming IO + return f'#pragma HLS STREAM {arg_name}={variable.name} depth={depth}' + + def write_project_cpp(self, model): + """Write the main architecture source file (myproject.cpp) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + prj_cpp_src = (filedir / '../templates/libero/firmware/myproject.cpp').resolve() + prj_cpp_dst = Path(f'{model.config.get_output_dir()}/firmware/{prj_name}.cpp').resolve() + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + prj_name = prj_name + + indent = ' ' + + with open(prj_cpp_src) as src, open(prj_cpp_dst, 'w') as dst: + for line in src.readlines(): + # Add headers to weights and biases + if 'myproject' in line: + newline = line.replace('myproject', prj_name) + + elif '// hls-fpga-machine-learning insert header' in line: + inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) + outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) + brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + if len(model_brams) > 0: + newline += ',\n' + brams_str + newline += '\n' + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + + elif '// hls-fpga-machine-learning insert load weights' in line: + newline = line + if model.config.get_writer_config()['WriteWeightsTxt']: + + newline += '#ifndef __SYNTHESIS__\n' + newline += ' static bool loaded_weights = false;\n' + newline += ' if (!loaded_weights) {\n' + + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.weight_class == 'CompressedWeightVariable': + newline += ( + indent + + ' nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.nonzeros, w.name, w.name + ) + ) + elif w.weight_class == 'ExponentWeightVariable': + newline += ( + indent + + ' nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + ) + else: + newline += indent + ' nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + + newline += ' loaded_weights = true;\n' + newline += ' }\n' + newline += '#endif' + + # Add input/output type + elif '// hls-fpga-machine-learning insert IO' in line: + newline = '' + all_inputs = [i.name for i in model_inputs] + all_outputs = [o.name for o in model_outputs] + all_brams = [b.name for b in model_brams] + io_type = model.config.get_config_value('IOType') + + pipeline_style = model.config.pipeline_style + pipeline_ii = model.config.pipeline_ii + pipeline_pragma = indent + f'#pragma HLS function {pipeline_style}' + if pipeline_style == 'pipeline' and pipeline_ii is not None: + pipeline_pragma += f' II({pipeline_ii})\n' + else: + pipeline_pragma += '\n' + + if io_type == 'io_parallel': + for i in model_inputs: + newline += indent + self._make_array_pragma(i, is_argument=True) + '\n' + for o in model_outputs: + newline += indent + self._make_array_pragma(o, is_argument=True) + '\n' + # TODO Expose interface in a backend config + newline += indent + '#pragma HLS interface control 
type(simple)\n' + for input_name in all_inputs: + newline += indent + f'#pragma HLS interface argument({input_name}) type(simple)\n' + for output_name in all_outputs: + newline += indent + f'#pragma HLS interface argument({output_name}) type(simple)\n' + newline += pipeline_pragma + + if io_type == 'io_stream': + newline += indent + '#pragma HLS interface control type(axi_target)\n' + newline += indent + '#pragma HLS interface default type(axi_target)' + for bram_name in all_brams: + newline += indent + f'#pragma HLS interface argument({bram_name}) dma(true)\n' + newline += pipeline_pragma + + elif '// hls-fpga-machine-learning insert layers' in line: + newline = line + '\n' + for layer in model.get_layers(): + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + def_cpp = var.definition_cpp() + if def_cpp is not None: + if var.pragma: + newline += ' ' + self._make_array_pragma(var) + '\n' + newline += ' ' + def_cpp + ';\n' + func = layer.get_attr('function_cpp', None) + if func: + if not isinstance(func, (list, set)): + func = [func] + if len(func) == 1: + newline += ' ' + func[0] + ' // ' + layer.name + '\n' + else: + newline += ' // ' + layer.name + '\n' + for line in func: + newline += ' ' + line + '\n' + if model.config.trace_output and layer.get_attr('trace', False): + newline += '#ifndef __SYNTHESIS__\n' + for var in vars: + newline += ' nnet::save_layer_output<{}>({}, "{}", {});\n'.format( + var.type.name, var.name, layer.name, var.size_cpp() + ) + newline += '#endif\n' + newline += '\n' + + # Just copy line + else: + newline = line + dst.write(newline) + + def write_project_header(self, model): + """Write the main architecture header file (myproject.h) + + Args: + model (ModelGraph): the hls4ml model. 
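+
+        The '// hls-fpga-machine-learning insert header' marker is replaced with the same
+        argument list as in the source file, so for a small io_parallel model the resulting
+        prototype looks roughly like (type and array names are illustrative):
+
+            void myproject(input_t input_1[N_INPUT_1_1], result_t layer2_out[N_LAYER_2]);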
+ """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + prj_h_src = (filedir / '../templates/libero/firmware/myproject.h').resolve() + prj_h_dst = Path(f'{model.config.get_output_dir()}/firmware/{prj_name}.h').resolve() + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + with open(prj_h_src) as src, open(prj_h_dst, 'w') as dst: + for line in src.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(prj_name.upper())) + + elif 'myproject' in line: + newline = line.replace('myproject', prj_name) + + elif '// hls-fpga-machine-learning insert header' in line: + inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) + outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) + brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + if len(model_brams) > 0: + newline += ',\n' + brams_str + newline += '\n' + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + + else: + newline = line + dst.write(newline) + + def write_defines(self, model): + """Write the C++ type definitions file (defines.h) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + filedir = Path(__file__).parent + defines_src = (filedir / '../templates/libero/firmware/defines.h').resolve() + defines_dst = Path(f'{model.config.get_output_dir()}/firmware/defines.h').resolve() + + with open(defines_src) as src, open(defines_dst, 'w') as dst: + for line in src.readlines(): + # Insert numbers + if '// hls-fpga-machine-learning insert numbers' in line: + newline = line + + defines_list = [] + for layer in model.get_layers(): + defines = '' + for k, v in layer.get_output_variable().get_shape(): + defines += f'#define {k} {v}\n' + + defines_list.append(defines) + + newline += ''.join(defines_list) + + elif '// hls-fpga-machine-learning insert layer-precision' in line: + newline = line + all_precision = OrderedDict() + for layer in model.get_layers(): + layer_precision = layer.get_layer_precision() + for type_name, type_var in layer_precision.items(): + # Ensure that layer's types doesn't override existing types + # This can happen in case of InplaceVariable types + if type_name not in all_precision: + all_precision[type_name] = type_var + for used_type in all_precision.values(): + newline += used_type.definition_cpp() + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + + else: + newline = line + dst.write(newline) + + def write_parameters(self, model): + """Write the C++ layer config file (parameters.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + filedir = Path(__file__).parent + params_src = (filedir / '../templates/libero/firmware/parameters.h').resolve() + params_dst = Path(f'{model.config.get_output_dir()}/firmware/parameters.h').resolve() + + with open(params_src) as src, open(params_dst, 'w') as dst: + for line in src.readlines(): + if '// hls-fpga-machine-learning insert includes' in line: + newline = line + for include in sorted( + set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), [])) + ): + newline += '#include "%s"\n' % include + + elif '// hls-fpga-machine-learning insert weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.storage.lower() != 'bram': + newline += f'#include "weights/{w.name}.h"\n' + + elif "// hls-fpga-machine-learning insert layer-config" in line: + newline = line + for layer in model.get_layers(): + config = layer.get_attr('config_cpp', None) + if config: + newline += '// ' + layer.name + '\n' + newline += config + '\n' + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + + else: + newline = line + dst.write(newline) + + def write_weights(self, model): + """Write the weights into header files + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + namespace = model.config.get_writer_config().get('Namespace', None) + write_txt = model.config.get_writer_config().get('WriteWeightsTxt', True) + for layer in model.get_layers(): + for weights in layer.get_weights(): + self.print_array_to_cpp( + weights, model.config.get_output_dir(), namespace=namespace, write_txt_file=write_txt + ) + + def __make_dat_file(self, original_path, project_path): + """ + Convert other input/output data types into a dat file, which is + a text file with the falttened matrix printed out. Note that ' ' is + assumed to be the delimiter. + """ + + # Take in data from current supported data files + if original_path[-3:] == "npy": + data = np.load(original_path) + else: + raise Exception("Unsupported input/output data files.") + + # Faltten data, just keep first dimension + data = data.reshape(data.shape[0], -1) + + def print_data(f): + for i in range(data.shape[0]): + for j in range(data.shape[1]): + f.write(str(data[i][j]) + " ") + f.write("\n") + + # Print out in dat file + with open(project_path, "w") as f: + print_data(f) + + def write_test_bench(self, model): + """Write the testbench files (myproject_test.cpp and input/output .dat files) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + out_dir = model.config.get_output_dir() + + tb_data_dir = Path(f'{out_dir}/tb_data/').resolve() + tb_data_dir.mkdir(parents=True, exist_ok=True) + + input_data = model.config.get_config_value('InputData') + output_predictions = model.config.get_config_value('OutputPredictions') + + if input_data: + if input_data[-3:] == 'dat': + copyfile(input_data, f'{out_dir}/tb_data/tb_input_features.dat') + else: + self.__make_dat_file(input_data, f'{out_dir}/tb_data/tb_input_features.dat') + + if output_predictions: + if output_predictions[-3:] == 'dat': + copyfile(output_predictions, f'{out_dir}/tb_data/tb_output_predictions.dat') + else: + self.__make_dat_file(output_predictions, f'{out_dir}/tb_data/tb_output_predictions.dat') + + tb_src = (filedir / '../templates/libero/myproject_test.cpp').resolve() + tb_dst = Path(f'{out_dir}/{prj_name}_test.cpp').resolve() + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + with open(tb_src) as src, open(tb_dst, 'w') as dst: + for line in src.readlines(): + indent = ' ' * (len(line) - len(line.lstrip(' '))) + + # Insert numbers + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert data' in line: + newline = line + offset = 0 + for inp in model_inputs: + newline += ' ' + inp.definition_cpp() + ';\n' + newline += ' nnet::copy_data(in, {});\n'.format( + inp.type.name, offset, inp.size_cpp(), inp.name + ) + offset += inp.size() + for out in model_outputs: + newline += ' ' + out.definition_cpp() + ';\n' + + elif '// hls-fpga-machine-learning insert zero' in line: + newline = line + for inp in model_inputs: + newline += indent + inp.definition_cpp() + ';\n' + newline += indent + f'nnet::fill_zero<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' + for out in model_outputs: + newline += indent + out.definition_cpp() + ';\n' + + elif '// hls-fpga-machine-learning insert 
top-level-function' in line: + newline = line + + input_vars = ','.join([i.name for i in model_inputs]) + output_vars = ','.join([o.name for o in model_outputs]) + bram_vars = ','.join([b.name for b in model_brams]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' + + newline += top_level + + elif '// hls-fpga-machine-learning insert predictions' in line: + newline = line + for out in model_outputs: + newline += indent + f'for(int i = 0; i < {out.size_cpp()}; i++) {{\n' + newline += indent + ' std::cout << pr[i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'std::cout << std::endl;\n' + + elif '// hls-fpga-machine-learning insert tb-output' in line: + newline = line + for out in model_outputs: + newline += indent + 'nnet::print_result<{}, {}>({}, fout);\n'.format( + out.type.name, out.size_cpp(), out.name + ) # TODO enable this + + elif ( + '// hls-fpga-machine-learning insert output' in line + or '// hls-fpga-machine-learning insert quantized' in line + ): + newline = line + for out in model_outputs: + newline += indent + 'nnet::print_result<{}, {}>({}, std::cout, true);\n'.format( + out.type.name, out.size_cpp(), out.name + ) + + elif '// hls-fpga-machine-learning insert namespace' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += indent + f'using namespace {namespace};\n' + + else: + newline = line + dst.write(newline) + + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + bridge_src = (filedir / '../templates/libero/myproject_bridge.cpp').resolve() + bridge_dst = Path(f'{model.config.get_output_dir()}/{prj_name}_bridge.cpp').resolve() + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + with open(bridge_src) as src, open(bridge_dst, 'w') as dst: + for line in src.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', prj_name.upper()) + + elif 'myproject' in line: + newline = line.replace('myproject', prj_name) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + '\n' + + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + def_cpp = i.definition_cpp(name_suffix='_ap') + vname = i.name + tname = i.type.name + size = i.size_cpp() + newline += indent + f'{def_cpp};\n' + newline += indent + f'nnet::convert_data<{dtype}, {tname}, {size}>({vname}, {vname}_ap);\n' + newline += '\n' + + for o in model_outputs: + def_cpp = o.definition_cpp(name_suffix='_ap') + newline += indent + f'{def_cpp};\n' + + 
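+                    # The generated call below passes arguments positionally in the same order as
+                    # the top-level function signature: converted inputs, then outputs, then any
+                    # BRAM-backed weight arrays.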
newline += '\n' + + input_vars = ','.join([i.name + '_ap' for i in model_inputs]) + bram_vars = ','.join([b.name for b in model_brams]) + output_vars = ','.join([o.name + '_ap' for o in model_outputs]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'{prj_name}({all_vars});\n' + newline += top_level + + newline += '\n' + + for o in model_outputs: + vname = o.name + tname = o.type.name + size = o.size_cpp() + newline += indent + f'nnet::convert_data<{tname}, {dtype}, {size}>({vname}_ap, {vname});\n' + + elif '// hls-fpga-machine-learning insert trace_outputs' in line: + newline = '' + for layer in model.get_layers(): + func = layer.get_attr('function_cpp', None) + if func and model.config.trace_output and layer.get_attr('trace', False): + vars = layer.get_variables() + for var in vars: + newline += ( + indent + + 'nnet::trace_outputs->insert(std::pair(' + + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' + ) + + elif '// hls-fpga-machine-learning insert namespace' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += indent + f'using namespace {namespace};\n' + + else: + newline = line + dst.write(newline) + + def write_build_script(self, model): + """Write the TCL/Shell build scripts (config.tcl, Makefile, build_lib.sh) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + + # project.tcl + cfg_tcl_dst = Path(f'{model.config.get_output_dir()}/config.tcl') + with open(cfg_tcl_dst, 'w') as f: + f.write('source $env(SHLS_ROOT_DIR)/examples/legup.tcl\n') + fpga_family = model.config.get_config_value('FPGAFamily') + fpga_part = model.config.get_config_value('Part') + board = model.config.get_config_value('Board') + clock = model.config.get_config_value('ClockPeriod') + f.write(f'set_project {fpga_family} {fpga_part} {board}\n') + f.write(f'set_parameter CLOCK_PERIOD {clock}\n') + + # Makefile + makefile_dst = Path(f'{model.config.get_output_dir()}/Makefile') + with open(makefile_dst, 'w') as f: + f.write(f'NAME = {prj_name}\n') + f.write('LOCAL_CONFIG = -legup-config=config.tcl\n') + f.write(f'SRCS = firmware/{prj_name}.cpp {prj_name}_test.cpp \n') + # Not sure if this is required, it is present in both GUI- and CLI-generated projects + f.write('LEVEL = $(SHLS_ROOT_DIR)/examples\n') + # This must be the last line + f.write('include $(LEVEL)/Makefile.common\n') + + # build_lib.sh + build_lib_src = (filedir / '../templates/libero/build_lib.sh').resolve() + build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', prj_name) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + dst.write(line) + build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) + + def write_nnet_utils(self, model): + """Copy the nnet_utils, AP types headers and any custom source to the project output directory + + Args: + model (ModelGraph): the hls4ml model. 
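+
+        After this step the output directory holds the Libero copies of the nnet_utils headers
+        (firmware/nnet_utils/nnet_dense.h, nnet_activation.h, ...) together with any extra
+        source files the backend registers through get_custom_source().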
+ """ + + # nnet_utils + filedir = Path(__file__).parent + out_dir = model.config.get_output_dir() + + srcpath = (filedir / '../templates/libero/nnet_utils/').resolve() + dstpath = Path(f'{out_dir}/firmware/nnet_utils/').resolve() + dstpath.mkdir(parents=True, exist_ok=True) + + headers = [Path(h).name for h in glob.glob(str(srcpath / '*.h'))] + + for h in headers: + copyfile(srcpath / h, dstpath / h) + + # custom source + custom_source = model.config.backend.get_custom_source() + for dst, srcpath in custom_source.items(): + dstpath = Path(f'{out_dir}/firmware/{dst}') + copyfile(srcpath, dstpath) + + def write_generated_code(self, model): + """Write the generated code (nnet_code_gen.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + codegen_path = Path(f'{model.config.get_output_dir()}/firmware/nnet_utils/nnet_code_gen.h') + with open(codegen_path) as src: + contents = src.readlines() + with open(codegen_path, 'w') as dst: + namespace = model.config.get_writer_config().get('Namespace', None) + + for line in contents: + if '// hls4ml insert code' in line: + newline = line + for layer in model.get_layers(): + for generated_code in layer.code.values(): + newline += str(generated_code) + else: + newline = line + if namespace is not None: + if 'namespace nnet' in newline: + newline = newline.replace('namespace nnet', f'namespace {namespace}') + dst.write(newline) + + def write_yml(self, model): + """Write the config to the YAML file + + Args: + model (ModelGraph): the hls4ml model. + """ + + def keras_model_representer(dumper, keras_model): + model_path = model.config.get_output_dir() + '/keras_model.keras' + keras_model.save(model_path) + return dumper.represent_scalar('!keras_model', model_path) + + try: + import keras + + KerasModel = keras.models.Model + + yaml.add_multi_representer(KerasModel, keras_model_representer) + except Exception: + pass + + with open(model.config.get_output_dir() + '/' + config_filename, 'w') as file: + yaml.dump(model.config.config, file) + + def write_tar(self, model): + """Write the generated project as a .tar.gz archive + + Args: + model (ModelGraph): the hls4ml model. + """ + + write_tar = model.config.get_writer_config().get('WriteTar', False) + if write_tar: + tar_path = Path(model.config.get_output_dir() + '.tar.gz') + tar_path.unlink(missing_ok=True) + with tarfile.open(tar_path, mode='w:gz') as archive: + archive.add(model.config.get_output_dir(), recursive=True, arcname='') + + def write_hls(self, model): + print('Writing HLS project') + self.write_project_dir(model) + self.write_project_cpp(model) + self.write_project_header(model) + self.write_weights(model) + self.write_defines(model) + self.write_parameters(model) + self.write_test_bench(model) + self.write_bridge(model) + self.write_build_script(model) + self.write_nnet_utils(model) + self.write_generated_code(model) + self.write_yml(model) + self.write_tar(model) + print('Done')