Akira's Study Room: ultra96v2でVitis 2020.2を使用してORB_SLAM2のFASTとDistirbuteOctTree処理をカーネル化していく話その2

続けて間引き処理をカーネルに追加していく。

フレームを4分割したサブフレームでの最大値を探索して、最後に結果を出力する。
メモリアクセスのコスト下げるためにコーナースコアは最終ワードにまとめた。
気休めだと思うけどレジスタに配置されるよう個別に宣言して全てif文で比較する。
カーネルの探索範囲を40x40の領域に限定すると32bitに全ての情報載せられそうだけど、改善は微々たるものと思うのでとりあえず保留。

カーネル部分のコードは以下の通り。

#include "xf_fast_and_dist_config.h"

template <int PW, int SRC_T, int ROWS, int COLS, int NPC>
void fastResult2Vec(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& in_img, ap_uint<32>* vec_out){

    int read_index = 0;
    ap_uint<32> vec0 = 0;
    ap_uint<8> vec_resp0 = 0;
    ap_uint<32> vec1 = 0;
    ap_uint<8> vec_resp1 = 0;
    ap_uint<32> vec2 = 0;
    ap_uint<8> vec_resp2 = 0;
    ap_uint<32> vec3 = 0;
    ap_uint<8> vec_resp3 = 0;

    for (int row = 0; row < in_img.rows; row++) {
#pragma HLS PIPELINE
        for (int col = 0; col < in_img.cols; col++) {
            ap_uint<8> temp = in_img.read(read_index++);
            if(temp != 0){
                if ((col <= COLS/2) && (row <= ROWS/2) && (temp > vec0)){vec0 = 0x80000000 + (row << 15) + col; vec_resp0 = temp;}
                if ((col >  COLS/2) && (row <= ROWS/2) && (temp > vec1)){vec1 = 0x80000000 + (row << 15) + col; vec_resp1 = temp;}
                if ((col <= COLS/2) && (row >  ROWS/2) && (temp > vec2)){vec2 = 0x80000000 + (row << 15) + col; vec_resp2 = temp;}
                if ((col >  COLS/2) && (row >  ROWS/2) && (temp > vec3)){vec3 = 0x80000000 + (row << 15) + col; vec_resp3 = temp;}
            }
        }
    }
    vec_out[0] = vec0;
    vec_out[1] = vec1;
    vec_out[2] = vec2;
    vec_out[3] = vec3;
    vec_out[4] = (vec_resp3 << 24) + (vec_resp2 << 16) + (vec_resp1 << 8) + vec_resp0;
    return;
}

extern "C" {

void fast_and_dist_accel(   ap_uint<PTR_WIDTH>* img_in, unsigned char threshold, /*ap_uint<PTR_WIDTH>* img_out,*/ int rows, int cols,
                            ap_uint<32>* vec_out) {
// clang-format off
    #pragma HLS INTERFACE m_axi      port=img_in        offset=slave  bundle=gmem0
    
//    #pragma HLS INTERFACE m_axi      port=img_out       offset=slave  bundle=gmem1
    #pragma HLS INTERFACE m_axi      port=vec_out       offset=slave  bundle=gmem1
    #pragma HLS INTERFACE s_axilite  port=rows
    #pragma HLS INTERFACE s_axilite  port=cols
    #pragma HLS INTERFACE s_axilite  port=threshold
    #pragma HLS INTERFACE s_axilite  port=return
    // clang-format on

    xf::cv::Mat<TYPE, HEIGHT, WIDTH, NPC1> imgInput(rows, cols);
    xf::cv::Mat<TYPE, HEIGHT, WIDTH, NPC1> imgOutput(rows, cols);
// clang-format off
    #pragma HLS DATAFLOW
    // clang-format on

    // Retrieve xf::cv::Mat objects from img_in data:
    xf::cv::Array2xfMat<PTR_WIDTH, TYPE, HEIGHT, WIDTH, NPC1>(img_in, imgInput);

    // Run xfOpenCV kernel:
    xf::cv::fast<NMS, TYPE, HEIGHT, WIDTH, NPC1>(imgInput, imgOutput, threshold);

    // Convert _dst xf::cv::Mat object to output array:
    //xf::cv::xfMat2Array<PTR_WIDTH, TYPE, HEIGHT, WIDTH, NPC1>(imgOutput, img_out);

    // Convert _dst xf::cv::Mat objecto to output vector
    fastResult2Vec<PTR_WIDTH, TYPE, HEIGHT, WIDTH, NPC1>(imgOutput, vec_out);

    return;
} // End of kernel

} // End of extern C

ビルドは以下の通り。hashはfcf6cdc2661a448b21190a7dd9b710e9b507f545

git clone https://github.com/akira-nishiyama/orb_slam_2_ros.git -b feature-fpga
cd orb_slam_2_ros
mkdir build
cd build
source <vitis-installation-path>/settings64.sh
source <sdk-installation-path>/environment-setup-aarch64-xilinx-linux
export AMENT_PREFIX_PATH=$OECORE_NATIVE_SYSROOT/usr:$OECORE_TARGET_SYSROOT/usr
export PYTHONPATH=$OECORE_NATIVE_SYSROOT/usr/lib/python3.7/site-packages:$OECORE_TARGET_SYSROOT/usr/lib/python3.7/site-packages
cmake .. -DCMAKE_INSTALL_SO_NO_EXE=0 -DBUILD_TESTING=OFF -DCMAKE_NO_SYSTEM_FROM_IMPORTED=1 -GNinja -DCMAKE_INSTALL_PREFIX=<path-to-sd-root> -DPLATFORM_COMPONENTS_PATH=<path-to-components> -DBUILD_TEST_TARGETS=OFF -DBUILD_TARGET=hw -DCMAKE_BUILD_TYPE=Debug -DENABLE_KRNL_CHECK=OFF -DBUILD_FAST_TEST=OFF -DBUILD_FAST_AND_DIST_TEST=ON -DDISABLE_RESIZE_KRNL=ON -DDISABLE_FAST_KRNL=OFF

実行結果は次の通り。
時間はそこそこ短くなったけど全然及ばないなぁ。
hostプログラムのclone処理が悪いと思うが。。。

root@ultra96v2-zynqmp:~# time ./orb_slam2_ros_mono_tum_test /usr/share/orb_slam2_ros/orb_slam2/Vocabulary/ORBvoc.txt /home/root/tum-dataset           

ORB-SLAM2 Copyright (C) 2014-2016 Raul Mur-Artal, University of Zaragoza.
This program comes with ABSOLUTELY NO WARRANTY;
This is free software, and you are welcome to redistribute it
under certain conditions. See LICENSE.txt.

OpenCV version : 3.4.3
Major version : 3
Minor version : 4
Subminor version : 3
Input sensor was set to: Monocular

Loading ORB Vocabulary.
Vocabulary loaded!


Camera Parameters: 
- fx: 517.306
- fy: 516.469
- cx: 318.643
- cy: 255.314
- k1: 0.262383
- k2: -0.953104
- k3: 0.002628
- p1: 1.16331
- p2: -0.005358
- fps: 30
- bf: 0
- color order: RGB (ignored if grayscale)
Found Platform
Platform Name: Xilinx
INFO: Device found - edge
Loading: '/usr/bin/xclbin/resize_krnl.xclbin'
file size:5625745
actual read size:5625745
program xclbin.
program xclbin Done.
create resize_accel kernel.
create resize_accel kernel Done.
Found Platform
Platform Name: Xilinx
INFO: Device found - edge
Loading: '/usr/bin/xclbin/resize_krnl.xclbin'
file size:5625745
actual read size:5625745
program xclbin.
program xclbin Done.
create resize_accel kernel.
create resize_accel kernel Done.

ORB Extractor Parameters: 
- Number of Features: 1000
- Scale Levels: 8
- Scale Factor: 1.2
- Initial Fast Threshold: 20
- Minimum Fast Threshold: 7

-------
Start processing sequence ...
Images in the sequence: 798

-------

median tracking time: 0.188161
mean tracking time: 0.188687

Saving keyframe trajectory to KeyFrameTrajectory.txt ...

trajectory saved!

real    2m56.512s
user    2m35.888s
sys 0m55.475s

Akira's Study Room

2021年5月14日金曜日

ultra96v2でVitis 2020.2を使用してORB_SLAM2のFASTとDistirbuteOctTree処理をカーネル化していく話その2

0 件のコメント:

コメントを投稿