铜锁 SM2 算法性能优化实践（二）｜快速模约减算法实现

08/02 16:00

算法实现

/*
* /* Pre-computed tables are "carry-less" values of modulus*(i+1),
* all values are in little-endian format.
*/
static const BN_ULONG _sm2_p_256[][BN_SM2_256_TOP] = {
{0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF00000000ull,
0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFEFFFFFFFFull},/*p*/
{0xFFFFFFFFFFFFFFFEull, 0xFFFFFFFE00000001ull,
0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFDFFFFFFFFull},/*2p*/
...
{0xFFFFFFFFFFFFFFF4ull, 0xFFFFFFF40000000Bull,
0xFFFFFFFFFFFFFFFFull, 0xFFFFFFF3FFFFFFFFull},/*12p*/
{0xFFFFFFFFFFFFFFF3ull, 0xFFFFFFF30000000Cull,
0xFFFFFFFFFFFFFFFFull, 0xFFFFFFF2FFFFFFFFull}/*13p*/
};


/* pre-compute the value of p^2 check if the input satisfies input < p^2. */
static const BN_ULONG _sm2_p_256_sqr[] = {
0x0000000000000001ULL, 0x00000001FFFFFFFEULL,
0xFFFFFFFE00000001ULL, 0x0000000200000000ULL,
0xFFFFFFFDFFFFFFFEULL, 0xFFFFFFFE00000003ULL,
0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFE00000000ULL
};


    	SM2_INT64 acc;         /* accumulator */

acc = rp[0];
acc += bp[8 - 8];
acc += bp[9 - 8];
acc += bp[10 - 8];
acc += bp[11 - 8];
acc += bp[12 - 8];
acc += bp[13 - 8];
acc += bp[14 - 8];
acc += bp[15 - 8];
acc += bp[13 - 8];
acc += bp[14 - 8];
acc += bp[15 - 8];	/*累加高位参数约化结果*/
rp[0] = (unsigned int)acc; /*获得结果的低32位*/
acc >>= 32; /*进位部分右移32位，作为v[1]输入的一部分*/


        /*
* r_d += s2 + s6 + s7 + s8 + s9
* s2 = (c15, c14, c13, c12, c11, 0, c9, c8)
*/
sm2_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 9, 8);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_SM2_256_TOP);
/*
* s6 = (c11, c11, c10, c15, c14, 0, c13, c12)
*/
sm2_set_256(t_d, buf.bn, 11, 11, 10, 15, 14, 0, 13, 12);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_SM2_256_TOP);
/*
* s7 = (c10, c15, c14, c13, c12, 0, c11, c10)
*/
sm2_set_256(t_d, buf.bn, 10, 15, 14, 13, 12, 0, 11, 10);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_SM2_256_TOP);
/*
* s8 = (c9, 0, 0, c9, c8, 0, c10, c9)
*/
sm2_set_256(t_d, buf.bn, 9, 0, 0, 9, 8, 0, 10, 9);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_SM2_256_TOP);
/*
* s9 = (c8, 0, 0, 0, c15, 0, c12, c11)
*/
sm2_set_256(t_d, buf.bn, 8, 0, 0, 0, 15, 0, 12, 11);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_SM2_256_TOP);


    mask =
0 - (PTR_SIZE_INT) (*u.f) (c_d, r_d, _sm2_p_256[0], BN_SM2_256_TOP);
mask &= 0 - (PTR_SIZE_INT) carry;
res = c_d;
res = (BN_ULONG *)(((PTR_SIZE_INT) res & ~mask) |
sm2_cp_bn(r_d, res, BN_SM2_256_TOP);
r->top = BN_SM2_256_TOP;
bn_correct_top(r);


测试

正确性测试

    /*
* Randomly generate two numbers in the prime field and multiply them，then compare the results of fast modular reduction and conventional algorithms.
*/
for (i = 0; i < 100; i++){
if (!TEST_true(BN_priv_rand_range_ex(a, p, 0, ctx))
|| !TEST_true(BN_priv_rand_range_ex(b, p, 0, ctx))
|| !TEST_true(BN_mul(ab_fast, a, b, ctx))
|| !TEST_true(BN_sm2_mod_256(ab_fast, ab_fast, p, ctx))
|| !TEST_true(BN_mod_mul(ab, a, b, p, ctx))) {
goto done;
}

if (!TEST_int_eq(BN_ucmp(ab, ab_fast), 0)){
goto done;
}
}


make test TESTS="test_mod_sm2"


➜  Tongsuo git:(sm2p256) ✗ make test TESTS="test_mod_sm2"
/Library/Developer/CommandLineTools/usr/bin/make depend && /Library/Developer/CommandLineTools/usr/bin/make _tests
10-test_mod_sm2.t .. ok
All tests successful.
Files=1, Tests=1,  1 wallclock secs ( 0.00 usr  0.01 sys +  0.07 cusr  0.02 csys =  0.10 CPU)
Result: PASS


文中链接

[1]白国强等人提出的快速模约减算法：https://ieeexplore.ieee.org/document/7011249

0 评论
0 收藏
0