Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Analytical FIFO sizing #1185

Open
wants to merge 8 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions src/finn/builder/build_dataflow_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,16 @@

class AutoFIFOSizingMethod(str, Enum):
    """Select the type of automatic FIFO sizing strategy.

    Members are ``str`` subclasses, so they compare equal to their string
    values.
    """

    # Presumably the characteristic-function-based sizing flow; confirm
    # against the build step that consumes this value.
    CHARACTERIZE = "characterize"
    # Used as the default value of DataflowBuildConfig.auto_fifo_strategy.
    LARGEFIFO_RTLSIM = "largefifo_rtlsim"


class FIFOCharacterizationMethod(str, Enum):
    """Select the strategy for characteristic sizing of FIFOs.

    Members are ``str`` subclasses, so they compare equal to their string
    values.
    """

    # Derive each node's characteristic functions empirically via RTL
    # simulation.
    CHARACTERIZE_RTLSIM = "rtlsim"
    # Use analytical characteristic functions where a node provides them.
    CHARACTERIZE_ANALYTICAL = "analytical"


class ShellFlowType(str, Enum):
"""For builds that produce a bitfile, select the shell flow that will integrate
the FINN-generated accelerator."""
Expand Down Expand Up @@ -116,9 +121,9 @@ class VerificationStepType(str, Enum):
"step_apply_folding_config",
"step_minimize_bit_width",
"step_generate_estimate_reports",
"step_set_fifo_depths",
"step_hw_codegen",
"step_hw_ipgen",
"step_set_fifo_depths",
"step_create_stitched_ip",
"step_measure_rtlsim_performance",
"step_out_of_context_synthesis",
Expand Down Expand Up @@ -273,6 +278,15 @@ class DataflowBuildConfig:
#: setting the FIFO sizes.
auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM

#: Which strategy will be used for characteristic function-based FIFO sizing.
#: CHARACTERIZE_RTLSIM performs an RTL simulation of each node
#: to deduce its characteristic functions empirically.
#: CHARACTERIZE_ANALYTICAL uses analytical functions where available,
#: avoiding the generation of IP cores.
characteristic_function_strategy: Optional[
FIFOCharacterizationMethod
] = FIFOCharacterizationMethod.CHARACTERIZE_RTLSIM

#: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
#: if set to True, always using Python instead
force_python_rtlsim: Optional[bool] = False
Expand Down
21 changes: 13 additions & 8 deletions src/finn/builder/build_dataflow_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,14 +553,18 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
model = model.transform(InsertDWC())
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
model = model.transform(GiveUniqueNodeNames())
model = model.transform(AnnotateCycles())

period = int(model.analysis(dataflow_performance)["max_cycles"] * 3 + 10)
model = model.transform(
PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
DeriveCharacteristic(
model,
period,
cfg.characteristic_function_strategy,
cfg._resolve_fpga_part(),
cfg._resolve_hls_clk_period(),
)
)
model = model.transform(HLSSynthIP())
model = model.transform(PrepareRTLSim())
model = model.transform(AnnotateCycles())
period = model.analysis(dataflow_performance)["max_cycles"] + 10
model = model.transform(DeriveCharacteristic(period))
model = model.transform(DeriveFIFOSizes())
model = model.transform(
InsertFIFO(
Expand Down Expand Up @@ -623,6 +627,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
"depth_trigger_uram",
"depth_trigger_bram",
]

extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs)

# perform FIFO splitting and shallow FIFO removal only after the final config
Expand All @@ -634,8 +639,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):

# after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
# this will only run for the new nodes (e.g. FIFOs and DWCs)
model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
model = model.transform(HLSSynthIP())
# model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
# model = model.transform(HLSSynthIP())
return model


Expand Down
38 changes: 38 additions & 0 deletions src/finn/custom_op/fpgadataflow/channelwise_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,41 @@ def execute_node(self, context, graph):
sess = rt.InferenceSession(model_func.SerializeToString())
result = sess.run(None, idict)
context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)

def prepare_kwargs_for_characteristic_fx(self):
    """Collect the parameters used by the analytical characteristic functions.

    Returns:
        A ``(NF, dim)`` tuple of plain Python ints, where ``NF`` is
        ``NumChannels // PE`` and ``dim`` is the product of the folded
        output shape with its first and last dimensions dropped.
    """
    pe = self.get_nodeattr("PE")
    num_channels = self.get_nodeattr("NumChannels")

    # Floor division keeps the computation in integer arithmetic instead of
    # round-tripping through a float as int(a / b) does.
    nf = num_channels // pe

    # np.prod returns a NumPy scalar, and a *float* 1.0 when the slice is
    # empty; range(dim) in the characteristic functions needs a real int.
    dim = int(np.prod(self.get_folded_output_shape()[1:-1]))

    return (nf, dim)

def characteristic_fx_input(self, txns, cycles, counter, kwargs):
    """Append one period of the input characteristic function to ``txns``.

    One entry is appended per cycle, and both ``counter`` and ``cycles``
    advance by one on every cycle.

    Args:
        txns: List extended in place with the running ``counter`` values.
        cycles: Cycle count accumulated so far.
        counter: Transaction count accumulated so far.
        kwargs: The ``(NF, dim)`` tuple from
            ``prepare_kwargs_for_characteristic_fx``; only ``dim`` is used.

    Returns:
        The updated ``(txns, cycles, counter)`` triple.
    """
    _, dim = kwargs

    # int() accepts NumPy scalars and the float 1.0 that np.prod yields for
    # an empty shape slice; max() mirrors range(), which is empty for
    # non-positive counts.
    steps = max(int(dim), 0)

    txns.extend(counter + k for k in range(steps))
    return txns, cycles + steps, counter + steps

def characteristic_fx_output(self, txns, cycles, counter, kwargs):
    """Append one period of the output characteristic function to ``txns``.

    One entry is appended per cycle, and both ``counter`` and ``cycles``
    advance by one on every cycle.

    Args:
        txns: List extended in place with the running ``counter`` values.
        cycles: Cycle count accumulated so far.
        counter: Transaction count accumulated so far.
        kwargs: The ``(NF, dim)`` tuple from
            ``prepare_kwargs_for_characteristic_fx``; only ``dim`` is used.

    Returns:
        The updated ``(txns, cycles, counter)`` triple.
    """
    _, dim = kwargs

    # int() accepts NumPy scalars and the float 1.0 that np.prod yields for
    # an empty shape slice; max() mirrors range(), which is empty for
    # non-positive counts.
    steps = max(int(dim), 0)

    txns.extend(counter + k for k in range(steps))
    return txns, cycles + steps, counter + steps
240 changes: 240 additions & 0 deletions src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,243 @@ def execute_node(self, context, graph):
# this automatically updates the execution context
inst = getCustomOp(im2col_node)
inst.execute_node(context, model_im2col.graph)

def prepare_kwargs_for_characteristic_fx(self):
    """Collect the parameters used by the analytical characteristic functions.

    Reads the node attributes ``IFMDim``, ``OFMDim``, ``ConvKernelDim``,
    ``Stride``, ``SIMD``, ``IFMChannels``, ``depthwise`` and ``is1D`` and
    derives the loop bounds that ``characteristic_fx_input`` and
    ``characteristic_fx_output`` iterate over.

    Returns:
        A 23-element tuple. The order is part of the contract because both
        characteristic functions unpack it positionally:
        ``(SIMD_COUNT, Stride_x, Stride_y, OUTPUT_SIZE, INPUT_SIZE,
        WINDOW_SIZE, BUFFER_SIZE, READ_CYCLES, OCNT_INITIAL, DEPTHWISE,
        DEFAULT_FIFO_DEPTH, is1d, multiplying_factor, number_blocks,
        cycles_write_block, cycles_read_block, max_cycles, baseIter,
        initial_buffer, FINISH, OFMDim_y, READ_DELAY, READ_ITES)``.
    """
    # Index 0 of each attribute is used for the *_x quantities and index 1
    # for the *_y quantities.
    ifm_dim_x = self.get_nodeattr("IFMDim")[0]
    ofm_dim = self.get_nodeattr("OFMDim")
    kernel_dim = self.get_nodeattr("ConvKernelDim")
    stride = self.get_nodeattr("Stride")
    ofm_dim_x, ofm_dim_y = ofm_dim[0], ofm_dim[1]
    kernel_x, kernel_y = kernel_dim[0], kernel_dim[1]
    stride_x, stride_y = stride[0], stride[1]

    simd = self.get_nodeattr("SIMD")
    ifm_channels = self.get_nodeattr("IFMChannels")
    depthwise = self.get_nodeattr("depthwise")
    is1d = self.get_nodeattr("is1D")

    # Floor division keeps everything in integer arithmetic instead of
    # round-tripping through a float as int(a / b) does.
    simd_count = ifm_channels // simd
    output_size = ofm_dim_x * kernel_x * simd_count
    input_size = ifm_dim_x * simd_count
    window_size = kernel_x * simd_count

    if depthwise:
        buffer_size = kernel_x * simd_count
        read_cycles = simd_count * (kernel_x - 1) - (kernel_x - 1)
        finish = ifm_dim_x - kernel_x - 2
    else:
        buffer_size = (kernel_x - 1) * simd_count
        read_cycles = 0
        finish = 0

    ocnt_initial = buffer_size + (stride_x - 1)
    default_fifo_depth = 2

    # Same quantity as simd_count; kept as a separate tuple field because
    # the consumers unpack both names.
    multiplying_factor = simd_count
    number_blocks = kernel_y // stride_y + 1
    cycles_write_block = ofm_dim_x * kernel_x * kernel_y * multiplying_factor
    cycles_read_block = stride_x * ifm_dim_x * multiplying_factor
    max_cycles = max(cycles_write_block, cycles_read_block)
    initial_buffer = ifm_dim_x * kernel_y * multiplying_factor
    base_iter = initial_buffer + ofm_dim_y * max_cycles

    read_delay = (
        number_blocks * kernel_x * kernel_y * ofm_dim_x * ofm_dim_y * multiplying_factor
        - kernel_x * kernel_y * ofm_dim_x
    )
    read_ites = (base_iter - ofm_dim_y) // max_cycles

    return (
        simd_count,
        stride_x,
        stride_y,
        output_size,
        input_size,
        window_size,
        buffer_size,
        read_cycles,
        ocnt_initial,
        depthwise,
        default_fifo_depth,
        is1d,
        multiplying_factor,
        number_blocks,
        cycles_write_block,
        cycles_read_block,
        max_cycles,
        base_iter,
        initial_buffer,
        finish,
        ofm_dim_y,
        read_delay,
        read_ites,
    )

def characteristic_fx_input(self, txns, cycles, counter, kwargs):
# Compute one period of the input characteristic function
#
# Appends the current value of `counter` to `txns` once per modelled cycle,
# advancing `cycles` each time and `counter` only on the cycles selected by
# the logic below. Returns the updated (txns, cycles, counter) triple.
#
# `kwargs` is the 23-element tuple built by
# prepare_kwargs_for_characteristic_fx; it is unpacked positionally, so the
# order below must match that function. Several fields are unpacked but not
# read in this method.
#
# NOTE(review): leading indentation is missing in this copy of the code, so
# the nesting of the branches below is not visible here; confirm against the
# repository file before relying on these comments.

(
SIMD_COUNT,
Stride_x,
Stride_y,
OUTPUT_SIZE,
INPUT_SIZE,
WINDOW_SIZE,
BUFFER_SIZE,
READ_CYCLES,
OCNT_INITIAL,
DEPTHWISE,
DEFAULT_FIFO_DEPTH,
is1d,
multiplying_factor,
number_blocks,
cycles_write_block,
cycles_read_block,
max_cycles,
baseIter,
initial_buffer,
FINISH,
OFMDim_y,
READ_DELAY,
READ_ITES,
) = kwargs

# Pick the wrap-around limit (OCNT_MAX) and the starting value of `ocnt`;
# both are only read by the 1D branch further down.
if DEPTHWISE:
OCNT_MAX = BUFFER_SIZE
ocnt = SIMD_COUNT

else:
OCNT_MAX = WINDOW_SIZE
if OCNT_INITIAL < WINDOW_SIZE:
ocnt = OCNT_INITIAL
else:
ocnt = -1

# fifo filling
# DEFAULT_FIFO_DEPTH cycles, each recording `counter` and then advancing
# both `counter` and `cycles`.
for i in range(0, DEFAULT_FIFO_DEPTH):
txns.append(counter)
counter += 1
cycles += 1

# main function

# Number of times `counter` has been advanced by the 1D loop below.
inp_count = 0

if is1d:
# One recorded entry per iteration over OUTPUT_SIZE.
for i in range(0, OUTPUT_SIZE):
txns.append(counter)
# `we` / `re` are presumably write-enable / read-enable flags - TODO confirm.
we = (i < OCNT_MAX) or (ocnt < (SIMD_COUNT * Stride_x))
re = i > 0

# `ocnt` counts up and is reset to 0 when it reaches OCNT_MAX.
if re:
ocnt += 1
if ocnt == OCNT_MAX:
ocnt = 0
# `counter` stops advancing once INPUT_SIZE - DEFAULT_FIFO_DEPTH increments
# have been made (the FIFO-filling loop above already advanced it
# DEFAULT_FIFO_DEPTH times).
if we:
if inp_count < INPUT_SIZE - DEFAULT_FIFO_DEPTH:
counter += 1
inp_count += 1

cycles += 1
else:
# Initial phase: initial_buffer + cycles_read_block - 1 cycles, each
# recording `counter` and advancing both `cycles` and `counter`.
for i in range(0, initial_buffer + cycles_read_block - 1):
txns.append(counter)
cycles += 1
counter += 1

txns.append(counter)
cycles += 1 # one extra for loop tail

# OFMDim_y - 1 outer iterations, each made of two inner loops.
for i in range(0, OFMDim_y - 1):
# First inner loop: `counter` is recorded but not advanced.
for j in range(0, cycles_write_block - cycles_read_block):
txns.append(counter)
cycles += 1

# Second inner loop: `counter` advances only while i < OFMDim_y - 2.
for j in range(0, cycles_read_block - 1):
if i < OFMDim_y - 2:
counter += 1
txns.append(counter)
cycles += 1
# else:
# if j < FINISH:
# counter+=1
# txns.append(counter)
# cycles+=1
#
return txns, cycles, counter

def characteristic_fx_output(self, txns, cycles, counter, kwargs):
    """Append one period of the output characteristic function to ``txns``.

    The period consists of a stall phase, during which ``counter`` is
    recorded unchanged on every cycle, followed by a burst phase, during
    which ``counter`` is recorded and then incremented on every cycle.

    Args:
        txns: List extended in place with the per-cycle ``counter`` values.
        cycles: Cycle count accumulated so far.
        counter: Transaction count accumulated so far.
        kwargs: The 23-element tuple from
            ``prepare_kwargs_for_characteristic_fx``. Only ``OUTPUT_SIZE``,
            ``READ_CYCLES``, ``is1d``, ``baseIter`` and ``initial_buffer``
            are read here.

    Returns:
        The updated ``(txns, cycles, counter)`` triple.
    """
    # The full positional unpack is kept on purpose: it documents the tuple
    # layout and fails loudly if the producer and this consumer drift apart.
    (
        SIMD_COUNT,
        Stride_x,
        Stride_y,
        OUTPUT_SIZE,
        INPUT_SIZE,
        WINDOW_SIZE,
        BUFFER_SIZE,
        READ_CYCLES,
        OCNT_INITIAL,
        DEPTHWISE,
        DEFAULT_FIFO_DEPTH,
        is1d,
        multiplying_factor,
        number_blocks,
        cycles_write_block,
        cycles_read_block,
        max_cycles,
        baseIter,
        initial_buffer,
        FINISH,
        OFMDim_y,
        READ_DELAY,
        READ_ITES,
    ) = kwargs

    # Hyper-parameter: fixed number of start-up cycles before any output.
    INITIAL_LOOP_CYCLES = 5

    # Each count is clamped at zero so that a non-positive bound contributes
    # nothing, exactly like an empty range().
    if is1d:
        stall = INITIAL_LOOP_CYCLES + max(int(READ_CYCLES), 0)
        burst = max(int(OUTPUT_SIZE), 0)
    else:
        stall = max(int(initial_buffer) + INITIAL_LOOP_CYCLES - 1, 0)
        burst = max(int(baseIter) - int(initial_buffer), 0)

    # Stall phase: nothing leaves the node, so the count stays flat.
    txns.extend([counter] * stall)
    # Burst phase: one output per cycle.
    txns.extend(counter + k for k in range(burst))

    return txns, cycles + stall + burst, counter + burst
9 changes: 7 additions & 2 deletions src/finn/custom_op/fpgadataflow/duplicatestreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,17 @@ def get_verilog_top_module_intf_names(self):
)
return intf_names

# NOTE(review): this hunk shows both sides of the change. The next line is
# presumably the signature being removed; the multi-line `def` after it is
# its replacement.
def derive_characteristic_fxns(self, period):
# New signature: the extra arguments are forwarded unchanged to the parent
# implementation at the end of this method.
def derive_characteristic_fxns(
self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
):
# Product of every folded-input-shape dimension except the last one.
n_inps = np.prod(self.get_folded_input_shape()[:-1])
# I/O description handed to the parent: n_inps zero-valued entries on "in0",
# and two initially empty output lists, "out0" and "out1".
io_dict = {
"inputs": {
"in0": [0 for i in range(n_inps)],
},
"outputs": {"out0": [], "out1": []},
}
# Presumably the call being removed (old `override_rtlsim_dict` keyword).
super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)

# Replacement call. The locally built io_dict is always passed as
# override_dict; the `override_dict` argument received by this method is
# not consulted anywhere in the lines shown.
super().derive_characteristic_fxns(
model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
)
Loading