MVE - Multi-View Environment mve-devel
Loading...
Searching...
No Matches
nearest_neighbor.cc
Go to the documentation of this file.
1/*
2 * Copyright (C) 2015, Simon Fuhrmann, Stepan Konrad
3 * TU Darmstadt - Graphics, Capture and Massively Parallel Computing
4 * All rights reserved.
5 *
6 * This software may be modified and distributed under the terms
7 * of the BSD 3-Clause license. See the LICENSE.txt file for details.
8 *
9 * A helpful SSE/MMX overview.
10 * Taken from: http://www.linuxjournal.com/content/
11 * ... introduction-gcc-compiler-intrinsics-vector-processing
12 *
13 * Compiler Options:
14 * - X86/MMX/SSE1/2/3 -mfpmath=sse -mmmx -msse -msse2 -msse3
15 * - ARM Neon -mfpu=neon -mfloat-abi=softfp
16 * - Freescale Altivec -maltivec -mabi=altivec
17 *
18 * Include Files:
19 * - arm_neon.h ARM Neon types & intrinsics
20 * - altivec.h Freescale Altivec types & intrinsics
21 * - mmintrin.h X86 MMX
22 * - xmmintrin.h X86 SSE1
23 * - emmintrin.h X86 SSE2
24 * - pmmintrin.h X86 SSE3
25 *
26 * MMX/SSE Data Types:
27 * - MMX: __m64 64 bits of integers.
28 * - SSE1: __m128 128 bits: four single precision floats.
29 * - SSE2: __m128i 128 bits of packed integers, __m128d 128 bits: two doubles.
30 *
31 * Macros to check for availability:
32 * - X86 MMX __MMX__
33 * - X86 SSE __SSE__
34 * - X86 SSE2 __SSE2__
35 * - X86 SSE3 __SSE3__
36 * - altivec functions __VEC__
37 * - neon functions __ARM_NEON__
38 */
39
40#include <algorithm>
41#include <iostream>
42#include <emmintrin.h> // SSE2
43#include <pmmintrin.h> // SSE3
44
46
48
49namespace
50{
51 /*
52 * Signed and unsigned short inner product implementation. Stores the
53 * largest and second largest inner product of query with elements.
54 *
55 * For SSE query and result should be 16 byte aligned.
56 * Otherwise loading and storing values into/from registers is slow.
57 * The dimension size must be divisible by 8, each __m128i register
58 * can load 8 shorts = 16 bytes = 128 bit.
59 */
60 template <typename T>
61 void
62 short_inner_prod (T const* query,
63 typename NearestNeighbor<T>::Result* result,
64 T const* elements, int num_elements, int dimensions)
65 {
66#if ENABLE_SSE2_NN_SEARCH && defined(__SSE2__)
67 /* Using a constant number reduces computation time by about 1/3. */
68 int const dim_8 = dimensions / 8;
69 __m128i const* descr_ptr = reinterpret_cast<__m128i const*>(elements);
70 for (int descr_iter = 0; descr_iter < num_elements; ++descr_iter)
71 {
72 /* Compute dot product between query and candidate. */
73 __m128i const* query_ptr = reinterpret_cast<__m128i const*>(query);
74 __m128i reg_result = _mm_set1_epi16(0);
75 for (int i = 0; i < dim_8; ++i, ++query_ptr, ++descr_ptr)
76 {
77 __m128i reg_query = _mm_load_si128(query_ptr);
78 __m128i reg_subject = _mm_load_si128(descr_ptr);
79 reg_result = _mm_add_epi16(reg_result,
80 _mm_mullo_epi16(reg_query, reg_subject));
81 }
82 T const* tmp = reinterpret_cast<T const*>(&reg_result);
83 int inner_product = tmp[0] + tmp[1] + tmp[2] + tmp[3]
84 + tmp[4] + tmp[5] + tmp[6] + tmp[7];
85
86 /* Check if new largest inner product has been found. */
87 if (inner_product >= result->dist_2nd_best)
88 {
89 if (inner_product >= result->dist_1st_best)
90 {
91 result->index_2nd_best = result->index_1st_best;
92 result->dist_2nd_best = result->dist_1st_best;
93 result->index_1st_best = descr_iter;
94 result->dist_1st_best = inner_product;
95 }
96 else
97 {
98 result->index_2nd_best = descr_iter;
99 result->dist_2nd_best = inner_product;
100 }
101 }
102 }
103#else
104 T const* descr_ptr = elements;
105 for (int i = 0; i < num_elements; ++i)
106 {
107 int inner_product = 0;
108 for (int i = 0; i < dimensions; ++i, ++descr_ptr)
109 inner_product += query[i] * *descr_ptr;
110
111 /* Check if new largest inner product has been found. */
112 if (inner_product >= result->dist_2nd_best)
113 {
114 if (inner_product >= result->dist_1st_best)
115 {
116 result->index_2nd_best = result->index_1st_best;
117 result->dist_2nd_best = result->dist_1st_best;
118 result->index_1st_best = i;
119 result->dist_1st_best = inner_product;
120 }
121 else
122 {
123 result->index_2nd_best = i;
124 result->dist_2nd_best = inner_product;
125 }
126 }
127 }
128#endif
129 }
130
131 /*
132 * Float inner product implementation. Stores the largest and
133 * second largest inner product of query with elements.
134 *
135 * For SSE query and result should be 16 byte aligned.
136 * Otherwise loading and storing values into/from registers is slow.
137 * The dimension size must be divisible by 4, each __m128i register
138 * can load 4 shorts = 16 bytes = 128 bit.
139 */
140 void
141 float_inner_prod (float const* query,
142 NearestNeighbor<float>::Result* result,
143 float const* elements, int num_elements, int dimensions)
144 {
145#if ENABLE_SSE3_NN_SEARCH && defined(__SSE3__)
146 /*
147 * SSE inner product implementation.
148 * Note that query and result should be 16 byte aligned.
149 * Otherwise loading and storing values into/from registers is slow.
150 * The dimension size must be divisible by 4, each __m128 register
151 * can load 4 floats = 16 bytes = 128 bit.
152 */
153
154 __m128 const* descr_ptr = reinterpret_cast<__m128 const*>(elements);
155 int const dim_4 = dimensions / 4;
156 for (int descr_iter = 0; descr_iter < num_elements; ++descr_iter)
157 {
158 /* Compute dot product between query and candidate. */
159 __m128 const* query_ptr = reinterpret_cast<__m128 const*>(query);
160 __m128 sum = _mm_setzero_ps();
161 for (int i = 0; i < dim_4; ++i, ++query_ptr, ++descr_ptr)
162 sum = _mm_add_ps(sum, _mm_mul_ps(*query_ptr, *descr_ptr));
163 sum = _mm_hadd_ps(sum, sum);
164 sum = _mm_hadd_ps(sum, sum);
165
166 /* Check if new largest inner product has been found. */
167 float inner_product = _mm_cvtss_f32(sum);
168 if (inner_product >= result->dist_2nd_best)
169 {
170 if (inner_product >= result->dist_1st_best)
171 {
172 result->index_2nd_best = result->index_1st_best;
173 result->dist_2nd_best = result->dist_1st_best;
174 result->index_1st_best = descr_iter;
175 result->dist_1st_best = inner_product;
176 }
177 else
178 {
179 result->index_2nd_best = descr_iter;
180 result->dist_2nd_best = inner_product;
181 }
182 }
183 }
184#else
185 float const* descr_ptr = elements;
186 for (int i = 0; i < num_elements; ++i)
187 {
188 float inner_product = 0.0f;
189 for (int j = 0; j < dimensions; ++j, ++descr_ptr)
190 inner_product += query[j] * *descr_ptr;
191
192 /* Check if new largest inner product has been found. */
193 if (inner_product >= result->dist_2nd_best)
194 {
195 if (inner_product >= result->dist_1st_best)
196 {
197 result->index_2nd_best = result->index_1st_best;
198 result->dist_2nd_best = result->dist_1st_best;
199 result->index_1st_best = i;
200 result->dist_1st_best = inner_product;
201 }
202 else
203 {
204 result->index_2nd_best = i;
205 result->dist_2nd_best = inner_product;
206 }
207 }
208 }
209#endif
210 }
211
212}
213
214template <>
215void
217 NearestNeighbor<short>::Result* result) const
218{
219 /* Result distances are shamelessly misused to store inner products. */
220 result->dist_1st_best = 0;
221 result->dist_2nd_best = 0;
222 result->index_1st_best = 0;
223 result->index_2nd_best = 0;
224
225 short_inner_prod<short>(query, result, this->elements,
226 this->num_elements, this->dimensions);
227
228 /*
229 * Compute actual square distances.
230 * The distance with 'signed char' vectors is: 2 * 127^2 - 2 * <Q, Ci>.
231 * The maximum distance is (2*127)^2, which unfortunately does not fit
232 * in a signed short. Therefore, the distance is clapmed at 127^2.
233 */
234 result->dist_1st_best = std::min(16129, std::max(0, (int)result->dist_1st_best));
235 result->dist_2nd_best = std::min(16129, std::max(0, (int)result->dist_2nd_best));
236 result->dist_1st_best = 32258 - 2 * result->dist_1st_best;
237 result->dist_2nd_best = 32258 - 2 * result->dist_2nd_best;
238}
239
240template <>
241void
242NearestNeighbor<unsigned short>::find (unsigned short const* query,
244{
245 /* Result distances are shamelessly misused to store inner products. */
246 result->dist_1st_best = 0;
247 result->dist_2nd_best = 0;
248 result->index_1st_best = 0;
249 result->index_2nd_best = 0;
250
251 short_inner_prod<unsigned short>(query, result, this->elements,
252 this->num_elements, this->dimensions);
253
254 /*
255 * Compute actual square distances.
256 * The distance with 'unsigned char' vectors is: 2 * 255^2 - 2 * <Q, Ci>.
257 * The maximum distance is (2*255)^2, which unfortunately does not fit
258 * in a unsigned short. Therefore, the result distance is clapmed:
259 * 2 * 255^2 - 2 * <Q, Ci> = 2 * (255^2 - <Q, Ci>) and (255^2 - <Q, Ci>)
260 * is clamped to 32767 and then multiplied by 2.
261 */
262 result->dist_1st_best = std::min(65025, (int)result->dist_1st_best);
263 result->dist_2nd_best = std::min(65025, (int)result->dist_2nd_best);
264 result->dist_1st_best = 65025 - result->dist_1st_best;
265 result->dist_2nd_best = 65025 - result->dist_2nd_best;
266 result->dist_1st_best = std::min(32767, (int)result->dist_1st_best) * 2;
267 result->dist_2nd_best = std::min(32767, (int)result->dist_2nd_best) * 2;
268}
269
270template <>
271void
273 NearestNeighbor<float>::Result* result) const
274{
275 /* Result distances are shamelessly misused to store inner products. */
276 result->dist_1st_best = 0.0f;
277 result->dist_2nd_best = 0.0f;
278 result->index_1st_best = 0;
279 result->index_2nd_best = 0;
280
281 float_inner_prod(query, result, this->elements,
282 this->num_elements, this->dimensions);
283
284 /*
285 * Compute actual (square) distances.
286 */
287 result->dist_1st_best = std::max(0.0f, 2.0f - 2.0f * result->dist_1st_best);
288 result->dist_2nd_best = std::max(0.0f, 2.0f - 2.0f * result->dist_2nd_best);
289}
290
Nearest (and second nearest) neighbor search for normalized vectors.
#define SFM_NAMESPACE_END
Definition defines.h:14
#define SFM_NAMESPACE_BEGIN
Definition defines.h:13
Unlike the naming suggests, these are square distances.