続けて間引き処理をカーネルに追加していく。
フレームを4分割したサブフレームでの最大値を探索して、最後に結果を出力する。
メモリアクセスのコスト下げるためにコーナースコアは最終ワードにまとめた。
気休めだと思うけどレジスタに配置されるよう個別に宣言して全てif文で比較する。
カーネルの探索範囲を40x40の領域に限定すると32bitに全ての情報載せられそうだけど、改善は微々たるものと思うのでとりあえず保留。
カーネル部分のコードは以下の通り。
#include "xf_fast_and_dist_config.h"
template <int PW, int SRC_T, int ROWS, int COLS, int NPC>
void fastResult2Vec(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& in_img, ap_uint<32>* vec_out){
int read_index = 0;
ap_uint<32> vec0 = 0;
ap_uint<8> vec_resp0 = 0;
ap_uint<32> vec1 = 0;
ap_uint<8> vec_resp1 = 0;
ap_uint<32> vec2 = 0;
ap_uint<8> vec_resp2 = 0;
ap_uint<32> vec3 = 0;
ap_uint<8> vec_resp3 = 0;
for (int row = 0; row < in_img.rows; row++) {
#pragma HLS PIPELINE
for (int col = 0; col < in_img.cols; col++) {
ap_uint<8> temp = in_img.read(read_index++);
if(temp != 0){
if ((col <= COLS/2) && (row <= ROWS/2) && (temp > vec0)){vec0 = 0x80000000 + (row << 15) + col; vec_resp0 = temp;}
if ((col > COLS/2) && (row <= ROWS/2) && (temp > vec1)){vec1 = 0x80000000 + (row << 15) + col; vec_resp1 = temp;}
if ((col <= COLS/2) && (row > ROWS/2) && (temp > vec2)){vec2 = 0x80000000 + (row << 15) + col; vec_resp2 = temp;}
if ((col > COLS/2) && (row > ROWS/2) && (temp > vec3)){vec3 = 0x80000000 + (row << 15) + col; vec_resp3 = temp;}
}
}
}
vec_out[0] = vec0;
vec_out[1] = vec1;
vec_out[2] = vec2;
vec_out[3] = vec3;
vec_out[4] = (vec_resp3 << 24) + (vec_resp2 << 16) + (vec_resp1 << 8) + vec_resp0;
return;
}
extern "C" {
void fast_and_dist_accel( ap_uint<PTR_WIDTH>* img_in, unsigned char threshold, /*ap_uint<PTR_WIDTH>* img_out,*/ int rows, int cols,
ap_uint<32>* vec_out) {
// clang-format off
#pragma HLS INTERFACE m_axi port=img_in offset=slave bundle=gmem0
// #pragma HLS INTERFACE m_axi port=img_out offset=slave bundle=gmem1
#pragma HLS INTERFACE m_axi port=vec_out offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=rows
#pragma HLS INTERFACE s_axilite port=cols
#pragma HLS INTERFACE s_axilite port=threshold
#pragma HLS INTERFACE s_axilite port=return
// clang-format on
xf::cv::Mat<TYPE, HEIGHT, WIDTH, NPC1> imgInput(rows, cols);
xf::cv::Mat<TYPE, HEIGHT, WIDTH, NPC1> imgOutput(rows, cols);
// clang-format off
#pragma HLS DATAFLOW
// clang-format on
// Retrieve xf::cv::Mat objects from img_in data:
xf::cv::Array2xfMat<PTR_WIDTH, TYPE, HEIGHT, WIDTH, NPC1>(img_in, imgInput);
// Run xfOpenCV kernel:
xf::cv::fast<NMS, TYPE, HEIGHT, WIDTH, NPC1>(imgInput, imgOutput, threshold);
// Convert _dst xf::cv::Mat object to output array:
//xf::cv::xfMat2Array<PTR_WIDTH, TYPE, HEIGHT, WIDTH, NPC1>(imgOutput, img_out);
// Convert _dst xf::cv::Mat objecto to output vector
fastResult2Vec<PTR_WIDTH, TYPE, HEIGHT, WIDTH, NPC1>(imgOutput, vec_out);
return;
} // End of kernel
} // End of extern C
ビルドは以下の通り。hashはfcf6cdc2661a448b21190a7dd9b710e9b507f545
git clone https://github.com/akira-nishiyama/orb_slam_2_ros.git -b feature-fpga
cd orb_slam_2_ros
mkdir build
cd build
source <vitis-installation-path>/settings64.sh
source <sdk-installation-path>/environment-setup-aarch64-xilinx-linux
export AMENT_PREFIX_PATH=$OECORE_NATIVE_SYSROOT/usr:$OECORE_TARGET_SYSROOT/usr
export PYTHONPATH=$OECORE_NATIVE_SYSROOT/usr/lib/python3.7/site-packages:$OECORE_TARGET_SYSROOT/usr/lib/python3.7/site-packages
cmake .. -DCMAKE_INSTALL_SO_NO_EXE=0 -DBUILD_TESTING=OFF -DCMAKE_NO_SYSTEM_FROM_IMPORTED=1 -GNinja -DCMAKE_INSTALL_PREFIX=<path-to-sd-root> -DPLATFORM_COMPONENTS_PATH=<path-to-components> -DBUILD_TEST_TARGETS=OFF -DBUILD_TARGET=hw -DCMAKE_BUILD_TYPE=Debug -DENABLE_KRNL_CHECK=OFF -DBUILD_FAST_TEST=OFF -DBUILD_FAST_AND_DIST_TEST=ON -DDISABLE_RESIZE_KRNL=ON -DDISABLE_FAST_KRNL=OFF
実行結果は次の通り。
時間はそこそこ短くなったけど全然及ばないなぁ。
hostプログラムのclone処理が悪いと思うが。。。
root@ultra96v2-zynqmp:~# time ./orb_slam2_ros_mono_tum_test /usr/share/orb_slam2_ros/orb_slam2/Vocabulary/ORBvoc.txt /home/root/tum-dataset
ORB-SLAM2 Copyright (C) 2014-2016 Raul Mur-Artal, University of Zaragoza.
This program comes with ABSOLUTELY NO WARRANTY;
This is free software, and you are welcome to redistribute it
under certain conditions. See LICENSE.txt.
OpenCV version : 3.4.3
Major version : 3
Minor version : 4
Subminor version : 3
Input sensor was set to: Monocular
Loading ORB Vocabulary.
Vocabulary loaded!
Camera Parameters:
- fx: 517.306
- fy: 516.469
- cx: 318.643
- cy: 255.314
- k1: 0.262383
- k2: -0.953104
- k3: 0.002628
- p1: 1.16331
- p2: -0.005358
- fps: 30
- bf: 0
- color order: RGB (ignored if grayscale)
Found Platform
Platform Name: Xilinx
INFO: Device found - edge
Loading: '/usr/bin/xclbin/resize_krnl.xclbin'
file size:5625745
actual read size:5625745
program xclbin.
program xclbin Done.
create resize_accel kernel.
create resize_accel kernel Done.
Found Platform
Platform Name: Xilinx
INFO: Device found - edge
Loading: '/usr/bin/xclbin/resize_krnl.xclbin'
file size:5625745
actual read size:5625745
program xclbin.
program xclbin Done.
create resize_accel kernel.
create resize_accel kernel Done.
ORB Extractor Parameters:
- Number of Features: 1000
- Scale Levels: 8
- Scale Factor: 1.2
- Initial Fast Threshold: 20
- Minimum Fast Threshold: 7
-------
Start processing sequence ...
Images in the sequence: 798
-------
median tracking time: 0.188161
mean tracking time: 0.188687
Saving keyframe trajectory to KeyFrameTrajectory.txt ...
trajectory saved!
real 2m56.512s
user 2m35.888s
sys 0m55.475s
0 件のコメント:
コメントを投稿