(5.x) Merge 4.x by asmorkalov · Pull Request #24486 · opencv/opencv · GitHub
[go: up one dir, main page]

Skip to content

(5.x) Merge 4.x #24486

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 45 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
1eb1d4c
Release OpenCV 3.4.20
asmorkalov Jun 27, 2023
49ee4d4
Merge pull request #23879 from asmorkalov/release_3.4.20
asmorkalov Jun 27, 2023
7f6c95f
Switch to version 3.4.20-dev
asmorkalov Jun 28, 2023
4761c28
Merge pull request #24168 from autoantwort:missing-include-3
autoantwort Aug 17, 2023
a287605
Clean up the Universal Intrinsic API.
hanliutong Oct 13, 2023
14b21f7
Ensure interarea algorithm usage in resize perfomance test.
asmorkalov Oct 17, 2023
7757759
Merge pull request #24417 from asmorkalov:as/resize_interarea_perf
asmorkalov Oct 17, 2023
014e848
Merge pull request #24367 from alexlyulkov:al/fixed-cumsum-inplace-flag
alexlyulkov Oct 18, 2023
c6fa993
Updated libade to 0.1.2d
mshabunin Oct 18, 2023
240b245
Merge pull request #24092 from Aser-Abdelfatah:GSoC_Support_GatherEle…
Aser-Abdelfatah Oct 18, 2023
2f1d529
Merge pull request #24422 from mshabunin:update-ade-012d
asmorkalov Oct 18, 2023
ba4d6c8
added detection & dispatching of some modern NEON instructions (NEON_…
vpisarev Oct 18, 2023
6e4280e
Merge pull request #24372 from Kumataro:fix24369
Kumataro Oct 19, 2023
f2ef81a
fp16 support for gather elements
fengyuentau Oct 19, 2023
5bffcdf
Merge pull request #24382 from sisakat:cuda-compile-multicore
sisakat Oct 19, 2023
a9664ab
Merge pull request #24427 from fengyuentau:gather_elements_fp16
asmorkalov Oct 19, 2023
c96f48e
Merge pull request #24412 from vrabaud:inter_area1
vrabaud Oct 19, 2023
fcdaaab
Unconditionally create SuperScale in BarcodeDetector to avoid null deref
vrabaud Oct 19, 2023
1aa4621
Merge pull request #24429 from vrabaud:inter_area1
asmorkalov Oct 20, 2023
b44cb33
Merge pull request #21066 from andrewerf:21052-openvino-native-onnx
andrewerf Oct 20, 2023
1c0ca41
Merge pull request #24371 from hanliutong:clean-up
asmorkalov Oct 20, 2023
996b6c3
Merge pull request #24425 from fengyuentau:fix_timvx_test
fengyuentau Oct 20, 2023
d142a79
Merge pull request #23929 from CNClareChen:4.x
CNClareChen Oct 20, 2023
c2f909f
Merge pull request #23894 from kallaballa:blobFromImagesWithParams
kallaballa Oct 20, 2023
3c9c964
Check the return value of avifDecoderSetIOMemory.
vrabaud Oct 23, 2023
099e002
Fixed wrong division operator in py_tutorials doc
COOLIRON2311 Oct 23, 2023
3429c27
Merge pull request #24438 from vrabaud:avif_check
asmorkalov Oct 24, 2023
8b47361
Merge pull request #24440 from COOLIRON2311:4.x
asmorkalov Oct 24, 2023
44c254c
Make sure AVIF decoder is destroyed in case of failure
vrabaud Oct 24, 2023
1fe0fc2
Merge pull request #24441 from vrabaud:avif_check
asmorkalov Oct 24, 2023
a3b3a58
Merge pull request #24322 from Abdurrahheem:ash/dev_einsum_ellips
Abdurrahheem Oct 24, 2023
38bc519
Merge pull request #24363 from cudawarped:videoio_ffmpeg_add_stream_e…
cudawarped Oct 25, 2023
cac1695
Update IPPICV binaries (20230919)
eplankin Oct 25, 2023
1911c63
fix: supress GCC13 warnings (#24434)
Kumataro Oct 26, 2023
52c33f4
Merge pull request #24451 from eplankin:3.4
alalek Oct 27, 2023
77a0ffc
Merge pull request #24461 from fengyuentau:tracker_vit_backend_target
fengyuentau Oct 27, 2023
617d7ff
Merge pull request #24454 from komakai:refactorObjcRange
komakai Oct 27, 2023
e9e6b1e
Merge pull request #24405 from kochanczyk:4.x
kochanczyk Oct 30, 2023
a3ebc0a
Fixed Python signatures in Doxygen documentation.
asmorkalov Oct 30, 2023
e202116
Merge remote-tracking branch 'upstream/3.4' into merge-3.4
alalek Oct 31, 2023
c91af16
Merge pull request #24409 from fengyuentau:norm_kernel
fengyuentau Nov 1, 2023
f530a24
Fix MinGW build issue due to obsensor
clee-ai Nov 1, 2023
2e49bf3
Merge pull request #24468 from asmorkalov:as/python_ctor_docs
asmorkalov Nov 2, 2023
7c9231f
Merge pull request #24478 from CCInc:mingw_fix
asmorkalov Nov 2, 2023
34f34f6
Merge branch 4.x
asmorkalov Nov 2, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
added detection & dispatching of some modern NEON instructions (NEON_…
…FP16, NEON_BF16) (#24420)

* added more or less cross-platform (based on POSIX signal() semantics) method to detect various NEON extensions, such as FP16 SIMD arithmetics, BF16 SIMD arithmetics, SIMD dotprod etc. It could be propagated to other instruction sets if necessary.

* hopefully fixed compile errors

* continue to fix CI

* another attempt to fix build on Linux aarch64

* * reverted to the original method to detect special arm neon instructions without signal()
* renamed FP16_SIMD & BF16_SIMD to NEON_FP16 and NEON_BF16, respectively

* removed extra whitespaces
  • Loading branch information
vpisarev authored Oct 18, 2023
commit ba4d6c859d21536f84e0328c16f4cc3e96bf3065
11 changes: 9 additions & 2 deletions cmake/OpenCVCompilerOptimizations.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,9 @@ if(X86 OR X86_64)
elseif(ARM OR AARCH64)
ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp")
ocv_update(CPU_NEON_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_fp16.cpp")
ocv_update(CPU_NEON_BF16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_bf16.cpp")
ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_dotprod.cpp")
if(NOT AARCH64)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
if(NOT MSVC)
Expand All @@ -343,12 +345,17 @@ elseif(ARM OR AARCH64)
endif()
ocv_update(CPU_FP16_IMPLIES "NEON")
else()
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
ocv_update(CPU_NEON_FLAGS_ON "")
ocv_update(CPU_FP16_IMPLIES "NEON")
ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16")
ocv_update(CPU_NEON_FP16_IMPLIES "NEON")
ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+fp16+bf16")
ocv_update(CPU_NEON_BF16_IMPLIES "NEON")
set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
set(CPU_DISPATCH "NEON_FP16;NEON_BF16;NEON_DOTPROD" CACHE STRING "${HELP_CPU_DISPATCH}")
endif()
elseif(MIPS)
ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp")
Expand Down
46 changes: 46 additions & 0 deletions cmake/checks/cpu_neon_bf16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
#include <stdio.h>
#include "arm_neon.h"

/*#if defined __clang__
#pragma clang attribute push (__attribute__((target("bf16"))), apply_to=function)
#elif defined GCC
#pragma GCC push_options
#pragma GCC target("armv8.2-a", "bf16")
#endif*/
// Reads eight consecutive floats starting at `src` and narrows them to one
// 8-lane bfloat16 vector: the first four floats are converted and passed as
// the first argument of vcombine_bf16, the next four as the second.
bfloat16x8_t vld1q_as_bf16(const float* src)
{
    const float32x4_t first_f32 = vld1q_f32(src);
    const float32x4_t second_f32 = vld1q_f32(src + 4);
    const bfloat16x4_t first_bf16 = vcvt_bf16_f32(first_f32);
    const bfloat16x4_t second_bf16 = vcvt_bf16_f32(second_f32);
    return vcombine_bf16(first_bf16, second_bf16);
}

// Prints the four float lanes of `r` to stdout as
// "<name>: (a, b, c, d)" with two decimals per lane.
void vprintreg(const char* name, const float32x4_t& r)
{
    float lanes[4];
    // Spill the register to memory so the lanes can be passed to printf.
    vst1q_f32(lanes, r);
    printf("%s: (%.2f, %.2f, %.2f, %.2f)\n",
           name, lanes[0], lanes[1], lanes[2], lanes[3]);
}

void test()
{
const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f };
bfloat16x8_t s1 = vld1q_as_bf16(src1), s2 = vld1q_as_bf16(src2);
float32x4_t d = vbfdotq_f32(vdupq_n_f32(0.f), s1, s2);
vprintreg("(s1[0]*s2[0] + s1[1]*s2[1], ... s1[6]*s2[6] + s1[7]*s2[7])", d);
}
/*#if defined __clang__
#pragma clang attribute pop
#elif defined GCC
#pragma GCC pop_options
#endif*/
#else
#error "BF16 is not supported"
#endif

// Probe entry point: runs the BF16 intrinsic exercise once and reports success.
int main()
{
    test();

    return 0;
}
File renamed without changes.
46 changes: 46 additions & 0 deletions cmake/checks/cpu_neon_fp16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#include <stdio.h>

#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
#include "arm_neon.h"

// Reads eight consecutive floats starting at `src` and narrows them to one
// 8-lane half-precision vector: the first four floats are converted and
// passed as the first argument of vcombine_f16, the next four as the second.
float16x8_t vld1q_as_f16(const float* src)
{
    const float32x4_t first_f32 = vld1q_f32(src);
    const float32x4_t second_f32 = vld1q_f32(src + 4);
    const float16x4_t first_f16 = vcvt_f16_f32(first_f32);
    const float16x4_t second_f16 = vcvt_f16_f32(second_f32);
    return vcombine_f16(first_f16, second_f16);
}

void vprintreg(const char* name, const float16x8_t& r)
{
float data[8];
vst1q_f32(data, vcvt_f32_f16(vget_low_f16(r)));
vst1q_f32(data + 4, vcvt_f32_f16(vg 8000 et_high_f16(r)));
printf("%s: (%.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f)\n",
name, data[0], data[1], data[2], data[3],
data[4], data[5], data[6], data[7]);
}

void test()
{
const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f };
float16x8_t s1 = vld1q_as_f16(src1), s2 = vld1q_as_f16(src2);
float16x8_t d = vsubq_f16(s1, s1);
d = vfmaq_laneq_f16(d, s1, s2, 0);
d = vfmaq_laneq_f16(d, s1, s2, 1);
d = vfmaq_laneq_f16(d, s1, s2, 2);
d = vfmaq_laneq_f16(d, s1, s2, 3);
d = vfmaq_laneq_f16(d, s1, s2, 4);
d = vfmaq_laneq_f16(d, s1, s2, 5);
d = vfmaq_laneq_f16(d, s1, s2, 6);
d = vfmaq_laneq_f16(d, s1, s2, 7);
vprintreg("s1*s2[0]+s1*s2[1] + ... + s1*s2[7]", d);
}
#else
#error "FP16 is not supported"
#endif

// Probe entry point: runs the FP16 intrinsic exercise once and reports success.
int main()
{
    test();

    return 0;
}
4 changes: 4 additions & 0 deletions modules/core/include/opencv2/core/cvdef.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,8 @@ namespace cv {

#define CV_CPU_NEON 100
#define CV_CPU_NEON_DOTPROD 101
#define CV_CPU_NEON_FP16 102
#define CV_CPU_NEON_BF16 103

#define CV_CPU_MSA 150

Expand Down Expand Up @@ -328,6 +330,8 @@ enum CpuFeatures {

CPU_NEON = 100,
CPU_NEON_DOTPROD = 101,
CPU_NEON_FP16 = 102,
CPU_NEON_BF16 = 103,

CPU_MSA = 150,

Expand Down
36 changes: 25 additions & 11 deletions modules/core/src/system.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,8 @@ struct HWFeatures

g_hwFeatureNames[CPU_NEON] = "NEON";
g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD";
g_hwFeatureNames[CPU_NEON_FP16] = "NEON_FP16";
g_hwFeatureNames[CPU_NEON_BF16] = "NEON_BF16";

g_hwFeatureNames[CPU_VSX] = "VSX";
g_hwFeatureNames[CPU_VSX3] = "VSX3";
Expand Down Expand Up @@ -566,10 +568,15 @@ struct HWFeatures

while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
{
// see https://elixir.bootlin.com/linux/latest/source/arch/arm64/include/uapi/asm/hwcap.h
if (auxv.a_type == AT_HWCAP)
{
have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0;
break;
have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP
have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP
}
else if (auxv.a_type == AT_HWCAP2)
{
have[CV_CPU_NEON_BF16] = (auxv.a_un.a_val & (1 << 14)) != 0; // HWCAP2_BF16
}
}

Expand Down Expand Up @@ -623,16 +630,23 @@ struct HWFeatures
have[CV_CPU_NEON] = true;
#endif
#if (defined __ARM_FP && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__))
have[CV_CPU_FP16] = true;
#endif
#if (defined __ARM_FEATURE_DOTPROD)
int has_feat_dotprod = 0;
size_t has_feat_dotprod_size = sizeof(has_feat_dotprod);
sysctlbyname("hw.optional.arm.FEAT_DotProd", &has_feat_dotprod, &has_feat_dotprod_size, NULL, 0);
if (has_feat_dotprod) {
have[CV_CPU_NEON_DOTPROD] = true;
}
have[CV_CPU_FP16] = have[CV_CPU_NEON_FP16] = true;
#endif
// system.cpp may be compiled w/o special -march=armv8...+dotprod, -march=armv8...+bf16 etc.,
// so we check for the features in any case, no matter what the compile flags are.
// We check the real hardware capabilities here.
int has_feat_dotprod = 0;
size_t has_feat_dotprod_size = sizeof(has_feat_dotprod);
sysctlbyname("hw.optional.arm.FEAT_DotProd", &has_feat_dotprod, &has_feat_dotprod_size, NULL, 0);
if (has_feat_dotprod) {
have[CV_CPU_NEON_DOTPROD] = true;
}
int has_feat_bf16 = 0;
size_t has_feat_bf16_size = sizeof(has_feat_bf16);
sysctlbyname("hw.optional.arm.FEAT_BF16", &has_feat_bf16, &has_feat_bf16_size, NULL, 0);
if (has_feat_bf16) {
have[CV_CPU_NEON_BF16] = true;
}
#elif (defined __clang__)
#if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
have[CV_CPU_NEON] = true;
Expand Down
0