最近在写界面时需要根据一幅图片中指定颜色作为透明色生成带alpha透明度的32位图片, 比较特殊的要求是透明色不是纯色,需要判断颜色相似度在一定范围内都作为透明色并根据相似度设成不同的alpha, 原来打算将RGB值转换成HSL然后判断H和L计算相似度和透明度再转回RGB, 尝试之后发觉效果并不理想而且速度慢了点. 后来把主意打到了3D距离公式上,即计算sqrt(R*R+G*G+B*B)再和阈值比较来计算实际透明度。试下来效果不错。接下来就是优化了, 显然优化首选当然是计算中的sqrt()。 在网上逛了一天, 又冥思苦想了几小时,终于写出了自己的float sqrt(float)函数,测试下来速度比CRT的sqrtf()快了整一倍,平均错误不超过0.0005, 最大错误0.0015左右。 用到程序中对图片的处理速度也提高了近75%。个人还是比较满意这个结果的。 现在公布出来一起分享一下。先说说原理, 实际上和著名的Quake3的invsqrt()差不多, 都是通过直接对float浮点格式进行位移操作获取近似值,然后用逼近公式迭代一次以获得满足一定精度的最终结果。比较头疼的是迭代公式中原本存在除法, 而除法在CPU/FPU中都是相对比较慢而且并行很差的. 最后不得已我在函数中只能用很粗暴的查表法来解决这个问题了. 我实际应用中程序里保存了处理过的1.0~2.0之间9bit精度的512个倒数. 也试过只存8bit精度的倒数,不过相应地错误率也扩大了一倍--虽然不影响我的这个应用. 既然1k内存和2k内存的差别对L1 cache命中率几乎没有影响(实际使用得出的结论), 那么当然精度更高点好了.下面的测试程序比较了3种快速sqrt()方法的差别, 分别列出计算0-0x1000000之间所有整数的sqrt后, 相对CRT的sqrtf()的速度比, 平均错误率, 最大错误率, 最大错误发生的原始数与具体偏差的值. 其中:
float qsqrt(float)是我写的sqrt()函数
float qsqrt2(float)是改造的Quake3的invsqrt()
float qsqrt3(float)是只位移, 不逼近, 直接获得近似结果

release版本下测试结果 (via_lut 对应 qsqrt, via_inv 对应 qsqrt2, via_bin 对应 qsqrt3):

使用 8bit 倒数表时:
SPEEDUP AVG(%) MAX(%) @VAL
via_lut(): 2.00 0.0798 0.2655 4538365( 2130.3438 +5.655273)
via_inv(): 1.74 0.1001 0.1752 15643569( 3955.1953 -6.930664)
via_bin(): 3.13 1.5444 3.5164 4505600( 2122.6399 -74.639893)

使用 9bit 倒数表时:
SPEEDUP AVG(%) MAX(%) @VAL
via_lut(): 2.00 0.0480 0.1648 4521982( 2126.4951 +3.504150)
via_inv(): 1.74 0.1001 0.1752 15643569( 3955.1953 -6.930664)
via_bin(): 3.13 1.5444 3.5164 4505600( 2122.6399 -74.639893)
生成倒数表的代码:
/*
 * Fills arr[0..511] with a 9-bit reciprocal table.
 *
 * For each k, the float whose bit pattern is 0x3f800000 + (k << 14)
 * (i.e. 1 + k/512: k occupies the top 9 mantissa bits) is inverted, and
 * the bit pattern of the reciprocal minus 0x3f800000 is stored in arr[k].
 *
 * arr must have room for 512 entries.
 */
void fillrecip9(DWORD* arr)
{
    /* Gives access to the same 32 bits as either an int or a float. */
    union {
        int i;
        float f;
    } bits;
    int k = 0;

    while (k < 512) {
        bits.i = 0x3f800000 + (k << 14);   /* bit pattern of 1 + k/512 */
        bits.f = 1.0f / bits.f;            /* reciprocal, computed as a float */
        arr[k] = bits.i - 0x3f800000;      /* stored relative to the bits of 1.0f */
        ++k;
    }
}

/*
 * Fills arr[0..255] with an 8-bit reciprocal table.
 *
 * Same scheme as fillrecip9(), but k occupies only the top 8 mantissa
 * bits (shift by 15), so entry k is derived from 1 / (1 + k/256).
 *
 * arr must have room for 256 entries.
 */
void fillrecip8(DWORD* arr)
{
    union {
        int i;
        float f;
    } bits;
    int k = 0;

    while (k < 256) {
        bits.i = 0x3f800000 + (k << 15);   /* bit pattern of 1 + k/256 */
        bits.f = 1.0f / bits.f;
        arr[k] = bits.i - 0x3f800000;
        ++k;
    }
}
具体测试代码:
#include <windows.h>
#include "stdafx.h"
#include <conio.h>
#include <math.h>//static DWORD recip8[256] = {
// 0x00000000, 0xffff00ff, 0xfffe03f8, 0xfffd08e5, 0xfffc0fc1, 0xfffb1885, 0xfffa232d, 0xfff92fb2,
// 0xfff83e10, 0xfff74e40, 0xfff6603e, 0xfff57404, 0xfff4898d, 0xfff3a0d5, 0xfff2b9d6, 0xfff1d48c,
// 0xfff0f0f1, 0xfff00f01, 0xffef2eb7, 0xffee500f, 0xffed7304, 0xffec9791, 0xffebbdb3, 0xffeae564,
// 0xffea0ea1, 0xffe93965, 0xffe865ac, 0xffe79373, 0xffe6c2b4, 0xffe5f36d, 0xffe52598, 0xffe45933,
// 0xffe38e39, 0xffe2c4a7, 0xffe1fc78, 0xffe135aa, 0xffe07038, 0xffdfac1f, 0xffdee95c, 0xffde27eb,
// 0xffdd67c9, 0xffdca8f1, 0xffdbeb62, 0xffdb2f17, 0xffda740e, 0xffd9ba42, 0xffd901b2, 0xffd84a5a,
// 0xffd79436, 0xffd6df44, 0xffd62b81, 0xffd578e9, 0xffd4c77b, 0xffd41733, 0xffd3680d, 0xffd2ba08,
// 0xffd20d21, 0xffd16154, 0xffd0b6a0, 0xffd00d01, 0xffcf6475, 0xffcebcf9, 0xffce168a, 0xffcd7127,
// 0xffcccccd, 0xffcc2978, 0xffcb8728, 0xffcae5d8, 0xffca4588, 0xffc9a634, 0xffc907da, 0xffc86a79,
// 0xffc7ce0c, 0xffc73294, 0xffc6980c, 0xffc5fe74, 0xffc565c8, 0xffc4ce08, 0xffc43730, 0xffc3a13e,
// 0xffc30c31, 0xffc27806, 0xffc1e4bc, 0xffc15250, 0xffc0c0c1, 0xffc0300c, 0xffbfa030, 0xffbf112b,
// 0xffbe82fa, 0xffbdf59d, 0xffbd6910, 0xffbcdd53, 0xffbc5264, 0xffbbc841, 0xffbb3ee7, 0xffbab656,
// 0xffba2e8c, 0xffb9a786, 0xffb92144, 0xffb89bc3, 0xffb81703, 0xffb79301, 0xffb70fbb, 0xffb68d31,
// 0xffb60b61, 0xffb58a48, 0xffb509e7, 0xffb48a3a, 0xffb40b41, 0xffb38cfa, 0xffb30f63, 0xffb2927c,
// 0xffb21643, 0xffb19ab6, 0xffb11fd4, 0xffb0a59b, 0xffb02c0b, 0xffafb322, 0xffaf3ade, 0xffaec33e,
// 0xffae4c41, 0xffadd5e6, 0xffad602b, 0xffaceb10, 0xffac7692, 0xffac02b0, 0xffab8f6a, 0xffab1cbe,
// 0xffaaaaab, 0xffaa392f, 0xffa9c84a, 0xffa957fb, 0xffa8e83f, 0xffa87917, 0xffa80a81, 0xffa79c7b,
// 0xffa72f05, 0xffa6c21e, 0xffa655c4, 0xffa5e9f7, 0xffa57eb5, 0xffa513fd, 0xffa4a9cf, 0xffa44029,
// 0xffa3d70a, 0xffa36e72, 0xffa3065e, 0xffa29ecf, 0xffa237c3, 0xffa1d13a, 0xffa16b31, 0xffa105a9,
// 0xffa0a0a1, 0xffa03c17, 0xff9fd80a, 0xff9f747a, 0xff9f1166, 0xff9eaecd, 0xff9e4cad, 0xff9deb07,
// 0xff9d89d9, 0xff9d2922, 0xff9cc8e1, 0xff9c6917, 0xff9c09c1, 0xff9baadf, 0xff9b4c70, 0xff9aee73,
// 0xff9a90e8, 0xff9a33cd, 0xff99d723, 0xff997ae7, 0xff991f1a, 0xff98c3bb, 0xff9868c8, 0xff980e41,
// 0xff97b426, 0xff975a75, 0xff97012e, 0xff96a850, 0xff964fda, 0xff95f7cc, 0xff95a025, 0xff9548e5,
// 0xff94f209, 0xff949b93, 0xff944581, 0xff93efd2, 0xff939a86, 0xff93459c, 0xff92f114, 0xff929cec,
// 0xff924925, 0xff91f5bd, 0xff91a2b4, 0xff915009, 0xff90fdbc, 0xff90abcc, 0xff905a38, 0xff900901,
// 0xff8fb824, 0xff8f67a2, 0xff8f177a, 0xff8ec7ab, 0xff8e7835, 0xff8e2918, 0xff8dda52, 0xff8d8be3,
// 0xff8d3dcb, 0xff8cf009, 0xff8ca29c, 0xff8c5584, 0xff8c08c1, 0xff8bbc51, 0xff8b7034, 0xff8b246b,
// 0xff8ad8f3, 0xff8a8dcd, 0xff8a42f8, 0xff89f874, 0xff89ae41, 0xff89645c, 0xff891ac7, 0xff88d181,
// 0xff888889, 0xff883fde, 0xff87f781, 0xff87af70, 0xff8767ab, 0xff872033, 0xff86d905, 0xff869223,
// 0xff864b8a, 0xff86053c, 0xff85bf37, 0xff85797c, 0xff853408, 0xff84eedd, 0xff84a9fa, 0xff84655e,
// 0xff842108, 0xff83dcf9, 0xff839930, 0xff8355ad, 0xff83126f, 0xff82cf75, 0xff828cc0, 0xff824a4e,
// 0xff820821, 0xff81c636, 0xff81848e, 0xff814328, 0xff810204, 0xff80c122, 0xff808081, 0xff804020
//};static DWORD recip9[512] = {... (我KAO!) 提示信息帖子内容过长
float qsqrt(float)是我写的sqrt()函数
float qsqrt2(float)是改造的Quake3的invsqrt()
float qsqrt3(float)是只位移, 不逼近, 直接获得近似结果

release版本下测试结果 (via_lut 对应 qsqrt, via_inv 对应 qsqrt2, via_bin 对应 qsqrt3):

使用 8bit 倒数表时:
SPEEDUP AVG(%) MAX(%) @VAL
via_lut(): 2.00 0.0798 0.2655 4538365( 2130.3438 +5.655273)
via_inv(): 1.74 0.1001 0.1752 15643569( 3955.1953 -6.930664)
via_bin(): 3.13 1.5444 3.5164 4505600( 2122.6399 -74.639893)

使用 9bit 倒数表时:
SPEEDUP AVG(%) MAX(%) @VAL
via_lut(): 2.00 0.0480 0.1648 4521982( 2126.4951 +3.504150)
via_inv(): 1.74 0.1001 0.1752 15643569( 3955.1953 -6.930664)
via_bin(): 3.13 1.5444 3.5164 4505600( 2122.6399 -74.639893)
生成倒数表的代码:
/*
 * Fills arr[0..511] with a 9-bit reciprocal table.
 *
 * For each k, the float whose bit pattern is 0x3f800000 + (k << 14)
 * (i.e. 1 + k/512: k occupies the top 9 mantissa bits) is inverted, and
 * the bit pattern of the reciprocal minus 0x3f800000 is stored in arr[k].
 *
 * arr must have room for 512 entries.
 */
void fillrecip9(DWORD* arr)
{
    /* Gives access to the same 32 bits as either an int or a float. */
    union {
        int i;
        float f;
    } bits;
    int k = 0;

    while (k < 512) {
        bits.i = 0x3f800000 + (k << 14);   /* bit pattern of 1 + k/512 */
        bits.f = 1.0f / bits.f;            /* reciprocal, computed as a float */
        arr[k] = bits.i - 0x3f800000;      /* stored relative to the bits of 1.0f */
        ++k;
    }
}

/*
 * Fills arr[0..255] with an 8-bit reciprocal table.
 *
 * Same scheme as fillrecip9(), but k occupies only the top 8 mantissa
 * bits (shift by 15), so entry k is derived from 1 / (1 + k/256).
 *
 * arr must have room for 256 entries.
 */
void fillrecip8(DWORD* arr)
{
    union {
        int i;
        float f;
    } bits;
    int k = 0;

    while (k < 256) {
        bits.i = 0x3f800000 + (k << 15);   /* bit pattern of 1 + k/256 */
        bits.f = 1.0f / bits.f;
        arr[k] = bits.i - 0x3f800000;
        ++k;
    }
}
具体测试代码:
#include <windows.h>
#include "stdafx.h"
#include <conio.h>
#include <math.h>//static DWORD recip8[256] = {
// 0x00000000, 0xffff00ff, 0xfffe03f8, 0xfffd08e5, 0xfffc0fc1, 0xfffb1885, 0xfffa232d, 0xfff92fb2,
// 0xfff83e10, 0xfff74e40, 0xfff6603e, 0xfff57404, 0xfff4898d, 0xfff3a0d5, 0xfff2b9d6, 0xfff1d48c,
// 0xfff0f0f1, 0xfff00f01, 0xffef2eb7, 0xffee500f, 0xffed7304, 0xffec9791, 0xffebbdb3, 0xffeae564,
// 0xffea0ea1, 0xffe93965, 0xffe865ac, 0xffe79373, 0xffe6c2b4, 0xffe5f36d, 0xffe52598, 0xffe45933,
// 0xffe38e39, 0xffe2c4a7, 0xffe1fc78, 0xffe135aa, 0xffe07038, 0xffdfac1f, 0xffdee95c, 0xffde27eb,
// 0xffdd67c9, 0xffdca8f1, 0xffdbeb62, 0xffdb2f17, 0xffda740e, 0xffd9ba42, 0xffd901b2, 0xffd84a5a,
// 0xffd79436, 0xffd6df44, 0xffd62b81, 0xffd578e9, 0xffd4c77b, 0xffd41733, 0xffd3680d, 0xffd2ba08,
// 0xffd20d21, 0xffd16154, 0xffd0b6a0, 0xffd00d01, 0xffcf6475, 0xffcebcf9, 0xffce168a, 0xffcd7127,
// 0xffcccccd, 0xffcc2978, 0xffcb8728, 0xffcae5d8, 0xffca4588, 0xffc9a634, 0xffc907da, 0xffc86a79,
// 0xffc7ce0c, 0xffc73294, 0xffc6980c, 0xffc5fe74, 0xffc565c8, 0xffc4ce08, 0xffc43730, 0xffc3a13e,
// 0xffc30c31, 0xffc27806, 0xffc1e4bc, 0xffc15250, 0xffc0c0c1, 0xffc0300c, 0xffbfa030, 0xffbf112b,
// 0xffbe82fa, 0xffbdf59d, 0xffbd6910, 0xffbcdd53, 0xffbc5264, 0xffbbc841, 0xffbb3ee7, 0xffbab656,
// 0xffba2e8c, 0xffb9a786, 0xffb92144, 0xffb89bc3, 0xffb81703, 0xffb79301, 0xffb70fbb, 0xffb68d31,
// 0xffb60b61, 0xffb58a48, 0xffb509e7, 0xffb48a3a, 0xffb40b41, 0xffb38cfa, 0xffb30f63, 0xffb2927c,
// 0xffb21643, 0xffb19ab6, 0xffb11fd4, 0xffb0a59b, 0xffb02c0b, 0xffafb322, 0xffaf3ade, 0xffaec33e,
// 0xffae4c41, 0xffadd5e6, 0xffad602b, 0xffaceb10, 0xffac7692, 0xffac02b0, 0xffab8f6a, 0xffab1cbe,
// 0xffaaaaab, 0xffaa392f, 0xffa9c84a, 0xffa957fb, 0xffa8e83f, 0xffa87917, 0xffa80a81, 0xffa79c7b,
// 0xffa72f05, 0xffa6c21e, 0xffa655c4, 0xffa5e9f7, 0xffa57eb5, 0xffa513fd, 0xffa4a9cf, 0xffa44029,
// 0xffa3d70a, 0xffa36e72, 0xffa3065e, 0xffa29ecf, 0xffa237c3, 0xffa1d13a, 0xffa16b31, 0xffa105a9,
// 0xffa0a0a1, 0xffa03c17, 0xff9fd80a, 0xff9f747a, 0xff9f1166, 0xff9eaecd, 0xff9e4cad, 0xff9deb07,
// 0xff9d89d9, 0xff9d2922, 0xff9cc8e1, 0xff9c6917, 0xff9c09c1, 0xff9baadf, 0xff9b4c70, 0xff9aee73,
// 0xff9a90e8, 0xff9a33cd, 0xff99d723, 0xff997ae7, 0xff991f1a, 0xff98c3bb, 0xff9868c8, 0xff980e41,
// 0xff97b426, 0xff975a75, 0xff97012e, 0xff96a850, 0xff964fda, 0xff95f7cc, 0xff95a025, 0xff9548e5,
// 0xff94f209, 0xff949b93, 0xff944581, 0xff93efd2, 0xff939a86, 0xff93459c, 0xff92f114, 0xff929cec,
// 0xff924925, 0xff91f5bd, 0xff91a2b4, 0xff915009, 0xff90fdbc, 0xff90abcc, 0xff905a38, 0xff900901,
// 0xff8fb824, 0xff8f67a2, 0xff8f177a, 0xff8ec7ab, 0xff8e7835, 0xff8e2918, 0xff8dda52, 0xff8d8be3,
// 0xff8d3dcb, 0xff8cf009, 0xff8ca29c, 0xff8c5584, 0xff8c08c1, 0xff8bbc51, 0xff8b7034, 0xff8b246b,
// 0xff8ad8f3, 0xff8a8dcd, 0xff8a42f8, 0xff89f874, 0xff89ae41, 0xff89645c, 0xff891ac7, 0xff88d181,
// 0xff888889, 0xff883fde, 0xff87f781, 0xff87af70, 0xff8767ab, 0xff872033, 0xff86d905, 0xff869223,
// 0xff864b8a, 0xff86053c, 0xff85bf37, 0xff85797c, 0xff853408, 0xff84eedd, 0xff84a9fa, 0xff84655e,
// 0xff842108, 0xff83dcf9, 0xff839930, 0xff8355ad, 0xff83126f, 0xff82cf75, 0xff828cc0, 0xff824a4e,
// 0xff820821, 0xff81c636, 0xff81848e, 0xff814328, 0xff810204, 0xff80c122, 0xff808081, 0xff804020
//};static DWORD recip9[512] = {... (我KAO!) 提示信息帖子内容过长
0x00000000, 0xffff8040, 0xffff00ff, 0xfffe823d, 0xfffe03f8, 0xfffd8631, 0xfffd08e5, 0xfffc8c16,
0xfffc0fc1, 0xfffb93e6, 0xfffb1885, 0xfffa9d9d, 0xfffa232d, 0xfff9a934, 0xfff92fb2, 0xfff8b6a6,
0xfff83e10, 0xfff7c5ee, 0xfff74e40, 0xfff6d705, 0xfff6603e, 0xfff5e9e8, 0xfff57404, 0xfff4fe91,
0xfff4898d, 0xfff414fa, 0xfff3a0d5, 0xfff32d1f, 0xfff2b9d6, 0xfff246fb, 0xfff1d48c, 0xfff16289,
0xfff0f0f1, 0xfff07fc4, 0xfff00f01, 0xffef9ea8, 0xffef2eb7, 0xffeebf2f, 0xffee500f, 0xffede156,
0xffed7304, 0xffed0518, 0xffec9791, 0xffec2a70, 0xffebbdb3, 0xffeb515a, 0xffeae564, 0xffea79d1,
0xffea0ea1, 0xffe9a3d2, 0xffe93965, 0xffe8cf59, 0xffe865ac, 0xffe7fc60, 0xffe79373, 0xffe72ae4,
0xffe6c2b4, 0xffe65ae2, 0xffe5f36d, 0xffe58c54, 0xffe52598, 0xffe4bf38, 0xffe45933, 0xffe3f389,
0xffe38e39, 0xffe32943, 0xffe2c4a7, 0xffe26063, 0xffe1fc78, 0xffe198e5, 0xffe135aa, 0xffe0d2c6,
0xffe07038, 0xffe00e01, 0xffdfac1f, 0xffdf4a93, 0xffdee95c, 0xffde887a, 0xffde27eb, 0xffddc7b0,
0xffdd67c9, 0xffdd0834, 0xffdca8f1, 0xffdc4a01, 0xffdbeb62, 0xffdb8d14, 0xffdb2f17, 0xffdad16a,
0xffda740e, 0xffda1700, 0xffd9ba42, 0xffd95dd3, 0xffd901b2, 0xffd8a5df, 0xffd84a5a, 0xffd7ef21,
0xffd79436, 0xffd73997, 0xffd6df44, 0xffd6853d, 0xffd62b81, 0xffd5d210, 0xffd578e9, 0xffd5200d,
0xffd4c77b, 0xffd46f32, 0xffd41733, 0xffd3bf7c, 0xffd3680d, 0xffd310e7, 0xffd2ba08, 0xffd26371,
0xffd20d21, 0xffd1b717, 0xffd16154, 0xffd10bd7, 0xffd0b6a0, 0xffd061ae, 0xffd00d01, 0xffcfb899,
0xffcf6475, 0xffcf1095, 0xffcebcf9, 0xffce69a0, 0xffce168a, 0xffcdc3b8, 0xffcd7127, 0xffcd1ed9,
0xffcccccd, 0xffcc7b02, 0xffcc2978, 0xffcbd830, 0xffcb8728, 0xffcb3660, 0xffcae5d8, 0xffca9590,
0xffca4588, 0xffc9f5bf, 0xffc9a634, 0xffc956e8, 0xffc907da, 0xffc8b90b, 0xffc86a79, 0xffc81c24,
0xffc7ce0c, 0xffc78032, 0xffc73294, 0xffc6e532, 0xffc6980c, 0xffc64b22, 0xffc5fe74, 0xffc5b201,
0xffc565c8, 0xffc519cb, 0xffc4ce08, 0xffc4827f, 0xffc43730, 0xffc3ec1a, 0xffc3a13e, 0xffc3569b,
0xffc30c31, 0xffc2c1ff, 0xffc27806, 0xffc22e45, 0xffc1e4bc, 0xffc19b6a, 0xffc15250, 0xffc1096d,
0xffc0c0c1, 0xffc0784b, 0xffc0300c, 0xffbfe803, 0xffbfa030, 0xffbf5892, 0xffbf112b, 0xffbec9f8,
0xffbe82fa, 0xffbe3c31, 0xffbdf59d, 0xffbdaf3c, 0xffbd6910, 0xffbd2318, 0xffbcdd53, 0xffbc97c2,
0xffbc5264, 0xffbc0d39, 0xffbbc841, 0xffbb837b, 0xffbb3ee7, 0xffbafa86, 0xffbab656, 0xffba7258,
0xffba2e8c, 0xffb9eaf0, 0xffb9a786, 0xffb9644d, 0xffb92144, 0xffb8de6c, 0xffb89bc3, 0xffb8594b,
0xffb81703, 0xffb7d4ea, 0xffb79301, 0xffb75147, 0xffb70fbb, 0xffb6ce5f, 0xffb68d31, 0xffb64c32,
0xffb60b61, 0xffb5cabe, 0xffb58a48, 0xffb54a01, 0xffb509e7, 0xffb4c9fa, 0xffb48a3a, 0xffb44aa7,
0xffb40b41, 0xffb3cc07, 0xffb38cfa, 0xffb34e19, 0xffb30f63, 0xffb2d0da, 0xffb2927c, 0xffb2544a,
0xffb21643, 0xffb1d867, 0xffb19ab6, 0xffb15d2f, 0xffb11fd4, 0xffb0e2a2, 0xffb0a59b, 0xffb068be,
0xffb02c0b, 0xffafef82, 0xffafb322, 0xffaf76eb, 0xffaf3ade, 0xffaefefa, 0xffaec33e, 0xffae87ab,
0xffae4c41, 0xffae1100, 0xffadd5e6, 0xffad9af5, 0xffad602b, 0xffad258a, 0xffaceb10, 0xffacb0bd,
0xffac7692, 0xffac3c8d, 0xffac02b0, 0xffabc8fa, 0xffab8f6a, 0xffab5601, 0xffab1cbe, 0xffaae3a1,
0xffaaaaab, 0xffaa71da, 0xffaa392f, 0xffaa00aa, 0xffa9c84a, 0xffa99010, 0xffa957fb, 0xffa9200b,
0xffa8e83f, 0xffa8b099, 0xffa87917, 0xffa841ba, 0xffa80a81, 0xffa7d36c, 0xffa79c7b, 0xffa765ae,
0xffa72f05, 0xffa6f880, 0xffa6c21e, 0xffa68bdf, 0xffa655c4, 0xffa61fcc, 0xffa5e9f7, 0xffa5b445,
0xffa57eb5, 0xffa54948, 0xffa513fd, 0xffa4ded5, 0xffa4a9cf, 0xffa474eb, 0xffa44029, 0xffa40b89,
0xffa3d70a, 0xffa3a2ad, 0xffa36e72, 0xffa33a57, 0xffa3065e, 0xffa2d286, 0xffa29ecf, 0xffa26b39,
0xffa237c3, 0xffa2046e, 0xffa1d13a, 0xffa19e25, 0xffa16b31, 0xffa1385d, 0xffa105a9, 0xffa0d315,
0xffa0a0a1, 0xffa06e4c, 0xffa03c17, 0xffa00a01, 0xff9fd80a, 0xff9fa633, 0xff9f747a, 0xff9f42e1,
0xff9f1166, 0xff9ee00a, 0xff9eaecd, 0xff9e7dae, 0xff9e4cad, 0xff9e1bcb, 0xff9deb07, 0xff9dba61,
0xff9d89d9, 0xff9d596e, 0xff9d2922, 0xff9cf8f3, 0xff9cc8e1, 0xff9c98ed, 0xff9c6917, 0xff9c395d,
0xff9c09c1, 0xff9bda41, 0xff9baadf, 0xff9b7b99, 0xff9b4c70, 0xff9b1d63, 0xff9aee73, 0xff9abf9f,
0xff9a90e8, 0xff9a624d, 0xff9a33cd, 0xff9a056a, 0xff99d723, 0xff99a8f7, 0xff997ae7, 0xff994cf3,
0xff991f1a, 0xff98f15d, 0xff98c3bb, 0xff989634, 0xff9868c8, 0xff983b77, 0xff980e41, 0xff97e126,
0xff97b426, 0xff978740, 0xff975a75, 0xff972dc4, 0xff97012e, 0xff96d4b2, 0xff96a850, 0xff967c08,
0xff964fda, 0xff9623c7, 0xff95f7cc, 0xff95cbec, 0xff95a025, 0xff957478, 0xff9548e5, 0xff951d6a,
0xff94f209, 0xff94c6c2, 0xff949b93, 0xff94707d, 0xff944581, 0xff941a9d, 0xff93efd2, 0xff93c51f,
0xff939a86, 0xff937005, 0xff93459c, 0xff931b4c, 0xff92f114, 0xff92c6f4, 0xff929cec, 0xff9272fc,
0xff924925, 0xff921f65, 0xff91f5bd, 0xff91cc2c, 0xff91a2b4, 0xff917953, 0xff915009, 0xff9126d7,
0xff90fdbc, 0xff90d4b8, 0xff90abcc, 0xff9082f7, 0xff905a38, 0xff903191, 0xff900901, 0xff8fe087,
0xff8fb824, 0xff8f8fd8, 0xff8f67a2, 0xff8f3f83, 0xff8f177a, 0xff8eef87, 0xff8ec7ab, 0xff8e9fe5,
0xff8e7835, 0xff8e509c, 0xff8e2918, 0xff8e01aa, 0xff8dda52, 0xff8db310, 0xff8d8be3, 0xff8d64cc,
0xff8d3dcb, 0xff8d16df, 0xff8cf009, 0xff8cc948, 0xff8ca29c, 0xff8c7c05, 0xff8c5584, 0xff8c2f18,
0xff8c08c1, 0xff8be27e, 0xff8bbc51, 0xff8b9638, 0xff8b7034, 0xff8b4a45, 0xff8b246b, 0xff8afea5,
0xff8ad8f3, 0xff8ab356, 0xff8a8dcd, 0xff8a6859, 0xff8a42f8, 0xff8a1dac, 0xff89f874, 0xff89d350,
0xff89ae41, 0xff898944, 0xff89645c, 0xff893f88, 0xff891ac7, 0xff88f61a, 0xff88d181, 0xff88acfb,
0xff888889, 0xff88642a, 0xff883fde, 0xff881ba6, 0xff87f781, 0xff87d36f, 0xff87af70, 0xff878b84,
0xff8767ab, 0xff8743e6, 0xff872033, 0xff86fc93, 0xff86d905, 0xff86b58b, 0xff869223, 0xff866ecd,
0xff864b8a, 0xff86285a, 0xff86053c, 0xff85e231, 0xff85bf37, 0xff859c50, 0xff85797c, 0xff8556b9,
0xff853408, 0xff85116a, 0xff84eedd, 0xff84cc63, 0xff84a9fa, 0xff8487a3, 0xff84655e, 0xff84432a,
0xff842108, 0xff83fef8, 0xff83dcf9, 0xff83bb0c, 0xff839930, 0xff837766, 0xff8355ad, 0xff833405,
0xff83126f, 0xff82f0e9, 0xff82cf75, 0xff82ae12, 0xff828cc0, 0xff826b7f, 0xff824a4e, 0xff82292f,
0xff820821, 0xff81e723, 0xff81c636, 0xff81a559, 0xff81848e, 0xff8163d3, 0xff814328, 0xff81228e,
0xff810204, 0xff80e18b, 0xff80c122, 0xff80a0c9, 0xff808081, 0xff806048, 0xff804020, 0xff802008
};float qsqrt(float x)
{
int n = (*(int*)&x-0x3f800000-0x98000)>>1; // approx. sqrt(x)
//int m = recip8[(n>>15)&0xff]-(n&0xff800000)+0x3f000000;
int m = recip9[(n>>14)&0x1ff]-(n&0xff800000)+0x3f000000;
n += 0x3f000000;
return *(float*)&n + x*(*(float*)&m); // 1 round Babylonian: y=0.5*(y+x/y)
}float qsqrt2(float x)
{
float o = x;
float xhalf = 0.5f*x;
int i = *(int*)&x;
i = 0x5f3759df - (i >> 1);
x = *(float*)&i;
x = x*(1.5f - xhalf*x*x);
return o*x;
}float qsqrt3(float x)
{
*(int*)&x = (((*(int*)&x)-0x3f800000)>>1)+0x3f800000 - 0x4c000;
return x;
}#define MAX_ACCU_CNT 0x1000000
#define MAX_LOOP_CNT 0x1000000int _tmain(int argc, _TCHAR* argv[])
{
// calc accuracy
float e, e1=0.0f, e2=0.0f, e3=0.0f;
float emax1= 0.0f, emax2 = 0.0f, emax3 = 0.0f;
float vmax1=0, vmax2=0, vmax3=0;
int dmax1, dmax2, dmax3; for( int i = 1; i<MAX_ACCU_CNT; ++i )
{
float s = sqrtf(i);
float q = qsqrt(i);
float e = abs(q-s)/s;
e1 += e;
if( e >= emax1 )
{
emax1 = e;
vmax1 = q-s;
dmax1 = i;
}
q = qsqrt2(i);
e = abs(q-s)/s;
e2 += e;
if( e >= emax2 )
{
emax2 = e;
vmax2 = q-s;
dmax2 = i;
}
q = qsqrt3(i);
e = abs(q-s)/s;
e3 += e;
if( e >= emax3 )
{
emax3 = e;
vmax3 = q-s;
dmax3 = i;
}
} e1/=MAX_ACCU_CNT;
e2/=MAX_ACCU_CNT;
e3/=MAX_ACCU_CNT; // calc speed
LARGE_INTEGER p, t, t1, t2, t3; // raise thread priority to highest level
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
// variable e used to prevent release optimizer eliminating following loops
e = 0.0f; QueryPerformanceCounter(&p);
for( int i = 0; i <MAX_LOOP_CNT; ++i )
{
e += sqrtf(i);
}
QueryPerformanceCounter(&t);
t.QuadPart -= p.QuadPart; QueryPerformanceCounter(&p);
for( int i = 1; i <MAX_LOOP_CNT; ++i )
{
e += qsqrt(i);
}
QueryPerformanceCounter(&t1);
t1.QuadPart -= p.QuadPart; QueryPerformanceCounter(&p);
for( int i = 1; i <MAX_LOOP_CNT; ++i )
{
e += qsqrt2(i);
}
QueryPerformanceCounter(&t2);
t2.QuadPart -= p.QuadPart; QueryPerformanceCounter(&p);
for( int i = 1; i <MAX_LOOP_CNT; ++i )
{
e += qsqrt3(i);
}
QueryPerformanceCounter(&t3);
t3.QuadPart -= p.QuadPart; SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS);
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_NORMAL);
double i1, i2, i3;
i1 = (double)t.QuadPart / t1.QuadPart;
i2 = (double)t.QuadPart / t2.QuadPart;
i3 = (double)t.QuadPart / t3.QuadPart; // passing e to printf() for preventing optimizer eliminating previous testing loops
printf(" SPEEDUP AVG(%%) MAX(%%) @VAL\n\n", e);
printf("via_lut(): %7.2f %10.4f %10.4f %10d(%10.4f%+f)\n", i1, (double)e1*100.0f, (double)emax1*100.0f, dmax1, (double)sqrtf(dmax1), (double)vmax1);
printf("via_inv(): %7.2f %10.4f %10.4f %10d(%10.4f%+f)\n", i2, (double)e2*100.0f, (double)emax2*100.0f, dmax2, (double)sqrtf(dmax2), (double)vmax2);
printf("via_sft(): %7.2f %10.4f %10.4f %10d(%10.4f%+f)\n", i3, (double)e3*100.0f, (double)emax3*100.0f, dmax3, (double)sqrtf(dmax3), (double)vmax3); _getch();
}