// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //

// Operands and result as digit vectors in radix 256, least significant digit
// at index 0 (p[0] is the low digit of x[0]*y[0]).
// NOTE(review): the algorithm assumes every x[i] and y[j] is < 256 -- confirm
// against whatever drives these inputs.
[4]nat x;
[4]nat y;
[8]nat p;

// Multiplies the 4-digit numbers x and y into the 8-digit product p.
//
// Structure:
//   1. form all 16 digit products and split each into a high and a low digit;
//   2. reduce the columns of equal weight with digit adders in stages until
//      every column holds at most two digits (heights 6, 4, 3, 2);
//   3. add the two remaining rows with a parallel prefix carry network.
//
// "Column k" below means weight 256^k.  An adder "a + b (+ c)" sums digits of
// one column; "% 256" stays in that column, "/ 256" is the carry digit that
// moves to column k+1.
thread RadixBMulDadda {
    nat t0,t1,t10,t100,t101,t102,t103,t104,t105,t106,t107,t108,t109,t11,t110,
        t111,t112,t113,t114,t115,t116,t117,t118,t119,t12,t120,t13,t14,t15,t16,
        t17,t18,t19,t2,t20,t21,t22,t23,t24,t25,t26,t27,t28,t29,t3,t30,t31,t32,
        t33,t34,t35,t36,t37,t38,t39,t4,t40,t41,t42,t43,t44,t45,t46,t47,t48,t49,
        t5,t50,t51,t52,t53,t54,t55,t56,t57,t58,t59,t6,t60,t61,t62,t63,t64,t65,
        t66,t67,t68,t69,t7,t70,t71,t72,t73,t74,t75,t76,t77,t78,t79,t8,t80,t81,
        t82,t83,t84,t85,t86,t87,t88,t89,t9,t90,t91,t92,t93,t94,t95,t96,t97,t98,
        t99;
    bool t121,t122,t123,t124,t125,t126,t127,t128,t129,t130,t131,t132,t133,t134;

    // compute partial products
    // x[i]*y[j]: low digit (% 256) belongs to column i+j, high digit (/ 256)
    // to column i+j+1.
    t0 = x[0] * y[0];  t1 = t0 / 256;   t2 = t0 % 256;
    t3 = x[0] * y[1];  t4 = t3 / 256;   t5 = t3 % 256;
    t6 = x[1] * y[0];  t7 = t6 / 256;   t8 = t6 % 256;
    t9 = x[0] * y[2];  t10 = t9 / 256;  t11 = t9 % 256;
    t12 = x[1] * y[1]; t13 = t12 / 256; t14 = t12 % 256;
    t15 = x[2] * y[0]; t16 = t15 / 256; t17 = t15 % 256;
    t18 = x[0] * y[3]; t19 = t18 / 256; t20 = t18 % 256;
    t21 = x[1] * y[2]; t22 = t21 / 256; t23 = t21 % 256;
    t24 = x[2] * y[1]; t25 = t24 / 256; t26 = t24 % 256;
    t27 = x[3] * y[0]; t28 = t27 / 256; t29 = t27 % 256;
    t30 = x[1] * y[3]; t31 = t30 / 256; t32 = t30 % 256;
    t33 = x[2] * y[2]; t34 = t33 / 256; t35 = t33 % 256;
    t36 = x[3] * y[1]; t37 = t36 / 256; t38 = t36 % 256;
    t39 = x[2] * y[3]; t40 = t39 / 256; t41 = t39 % 256;
    t42 = x[3] * y[2]; t43 = t42 / 256; t44 = t42 % 256;
    t45 = x[3] * y[3]; t46 = t45 / 256; t47 = t45 % 256;

    // reduce heights of each column to 6
    t48 = t29 + t26;       t49 = t48 / 256; t50 = t48 % 256;   // column 3
    t51 = t38 + t35 + t32; t52 = t51 / 256; t53 = t51 % 256;   // column 4

    // reduce heights of each column to 4
    t54 = t17 + t14;       t55 = t54 / 256; t56 = t54 % 256;   // column 2
    t57 = t13 + t10;       t58 = t57 / 256; t59 = t57 % 256;   // column 3
    t60 = t23 + t20 + t16; t61 = t60 / 256; t62 = t60 % 256;   // column 3
    t63 = t19 + t49 + t53; t64 = t63 / 256; t65 = t63 % 256;   // column 4
    t66 = t28 + t25 + t22; t67 = t66 / 256; t68 = t66 % 256;   // column 4
    t69 = t34 + t31 + t52; t70 = t69 / 256; t71 = t69 % 256;   // column 5
    t72 = t44 + t41 + t37; t73 = t72 / 256; t74 = t72 % 256;   // column 5
    t75 = t47 + t43;       t76 = t75 / 256; t77 = t75 % 256;   // column 6

    // reduce heights of each column to 3
    t78 = t11 + t7;        t79 = t78 / 256; t80 = t78 % 256;   // column 2
    t81 = t50 + t55 + t59; t82 = t81 / 256; t83 = t81 % 256;   // column 3
    t84 = t58 + t61 + t65; t85 = t84 / 256; t86 = t84 % 256;   // column 4
    t87 = t64 + t67 + t71; t88 = t87 / 256; t89 = t87 % 256;   // column 5
    t90 = t40 + t70 + t73; t91 = t90 / 256; t92 = t90 % 256;   // column 6

    // reduce heights of each column to 2
    t93 = t8 + t5;          t94 = t93 / 256;   t95 = t93 % 256;    // column 1
    t96 = t4 + t56 + t80;   t97 = t96 / 256;   t98 = t96 % 256;    // column 2
    t99 = t62 + t79 + t83;  t100 = t99 / 256;  t101 = t99 % 256;   // column 3
    t102 = t68 + t82 + t86; t103 = t102 / 256; t104 = t102 % 256;  // column 4
    t105 = t74 + t85 + t89; t106 = t105 / 256; t107 = t105 % 256;  // column 5
    t108 = t77 + t88 + t92; t109 = t108 / 256; t110 = t108 % 256;  // column 6
    // t112 is the carry out of column 7; nothing below reads it.
    t111 = t46 + t76 + t91; t112 = t111 / 256; t113 = t111 % 256;  // column 7

    // preliminary addition of the two remaining numbers
    // (digit-wise sums without carry propagation; t114 is column 1, ...,
    // t120 is column 7; column 0 is just t2)
    t114 = t1 + t95;
    t115 = t94 + t98;
    t116 = t97 + t101;
    t117 = t100 + t104;
    t118 = t103 + t107;
    t119 = t106 + t110;
    t120 = t109 + t113;

    // compute generate and propagate pairs
    // generate:  the digit sum overflows on its own (> 255)
    // propagate: the digit sum overflows iff a carry comes in (== 255)
    t121 = t114 > 255; t122 = t114 == 255;   // column 1
    t123 = t115 > 255; t124 = t115 == 255;   // column 2
    t125 = t116 > 255; t126 = t116 == 255;   // column 3
    t127 = t117 > 255; t128 = t117 == 255;   // column 4
    t129 = t118 > 255; t130 = t118 == 255;   // column 5
    t131 = t119 > 255; t132 = t119 == 255;   // column 6
    t133 = t120 > 255; t134 = t120 == 255;   // column 7

    // parallel prefix tree for computing carry bits
    // Each step combines a (generate, propagate) pair in place with the pair
    // of a lower group: g = p & g_lower | g; p = p & p_lower.
    // NOTE(review): this relies on '&' binding tighter than '|' -- confirm for
    // the target language.
    // After the tree, the generate flag of column k is the carry out of
    // column k.

    // up-level 1
    t123 = t124 & t121 | t123; t124 = t124 & t122;   // columns 1..2
    t127 = t128 & t125 | t127; t128 = t128 & t126;   // columns 3..4
    t131 = t132 & t129 | t131; t132 = t132 & t130;   // columns 5..6

    // up-level 2
    t127 = t128 & t123 | t127; t128 = t128 & t124;   // columns 1..4

    // down-level 4
    t131 = t132 & t127 | t131; t132 = t132 & t128;   // columns 1..6

    // down-level 5
    // The column-7 pair is combined exactly once: t131/t132 are final at this
    // point, so repeating the step would not change t133/t134.
    t133 = t134 & t131 | t133; t134 = t134 & t132;   // columns 1..7
    t125 = t126 & t123 | t125; t126 = t126 & t124;   // columns 1..3
    t129 = t130 & t127 | t129; t130 = t130 & t128;   // columns 1..5

    // compute final sum digits as the digits of the product
    // (column k receives the carry out of column k-1; column 1 has no
    // carry-in)
    t120 = t120+(t131?1:0);
    t119 = t119+(t129?1:0);
    t118 = t118+(t127?1:0);
    t117 = t117+(t125?1:0);
    t116 = t116+(t123?1:0);
    t115 = t115+(t121?1:0);

    // get the product digits
    p[0] = t2;
    p[1] = t114 % 256;
    p[2] = t115 % 256;
    p[3] = t116 % 256;
    p[4] = t117 % 256;
    p[5] = t118 % 256;
    p[6] = t119 % 256;
    p[7] = t120 % 256;
}