lilliput-ae-reference-implementation

Implementations of Lilliput-AE submitted to the NIST LWC standardization process
git clone https://git.kevinlegouguec.net/lilliput-ae-reference-implementation
Log | Files | Refs | README

traces-tbc.patch (11917B)


      1 diff --git a/src/add_threshold/cipher.c b/src/add_threshold/cipher.c
      2 index 778a100..3b49db5 100644
      3 --- a/src/add_threshold/cipher.c
      4 +++ b/src/add_threshold/cipher.c
      5 @@ -25,6 +25,8 @@ throughout the entire round function in order to avoid extra randomness
      6  generation to switch from 2 shares to 3 shares and vice versa.
      7  */
      8  
      9 +#include "debug.h"
     10 +
     11  #include <stdint.h>
     12  #include <string.h>
     13  
     14 @@ -100,6 +102,8 @@ static void _state_init(
     15      uint8_t SHARES_1[BLOCK_BYTES];
     16      randombytes(sizeof(SHARES_0), SHARES_0);
     17      randombytes(sizeof(SHARES_1), SHARES_1);
     18 +    debug_dump_buffer("SHARES_0", sizeof(SHARES_0), SHARES_0, 8);
     19 +    debug_dump_buffer("SHARES_1", sizeof(SHARES_1), SHARES_1, 8);
     20  
     21      memcpy(X, SHARES_0, BLOCK_BYTES);
     22      memcpy(Y, SHARES_1, BLOCK_BYTES);
     23 @@ -117,15 +121,25 @@ static void _compute_round_tweakeys(
     24      uint8_t RTK_Y[ROUNDS][ROUND_TWEAKEY_BYTES]
     25  )
     26  {
     27 +    fprintf(DUMP, "computing %zu round sub-tweakeys\n", (size_t)ROUNDS);
     28 +
     29      uint8_t TK_X[TWEAKEY_BYTES];
     30      uint8_t TK_Y[TWEAKEY_BYTES];
     31      tweakey_state_init(TK_X, TK_Y, key, tweak);
     32      tweakey_state_extract(TK_X, TK_Y, 0, RTK_X[0], RTK_Y[0]);
     33  
     34 +    fprintf(DUMP, "    0\n");
     35 +    debug_dump_buffer("RTK_X", ROUND_TWEAKEY_BYTES, RTK_X[0], 8);
     36 +    debug_dump_buffer("RTK_Y", ROUND_TWEAKEY_BYTES, RTK_Y[0], 8);
     37 +
     38      for (size_t i=1; i<ROUNDS; i++)
     39      {
     40          tweakey_state_update(TK_X, TK_Y);
     41 +        debug_dump_buffer("TK_X", TWEAKEY_BYTES, TK_X, 8);
     42 +        debug_dump_buffer("TK_Y", TWEAKEY_BYTES, TK_Y, 8);
     43          tweakey_state_extract(TK_X, TK_Y, i, RTK_X[i], RTK_Y[i]);
     44 +        debug_dump_buffer("RTK_X", ROUND_TWEAKEY_BYTES, RTK_X[i], 8);
     45 +        debug_dump_buffer("RTK_Y", ROUND_TWEAKEY_BYTES, RTK_Y[i], 8);
     46      }
     47  }
     48  
     49 @@ -138,6 +152,12 @@ static void _nonlinear_layer(
     50      const uint8_t RTK_Y[ROUND_TWEAKEY_BYTES]
     51  )
     52  {
     53 +    fprintf(DUMP, "        nonlinear layer\n");
     54 +
     55 +    debug_dump_buffer("X", BLOCK_BYTES, X, 12);
     56 +    debug_dump_buffer("Y", BLOCK_BYTES, Y, 12);
     57 +    debug_dump_buffer("Z", BLOCK_BYTES, Z, 12);
     58 +
     59      uint8_t x_hi, y_hi, z_hi;   // High nibbles for the Feistel network
     60      uint8_t x_lo, y_lo, z_lo;   // Low nibbles for the Feistel network
     61      uint8_t tmp0, tmp1, tmp2;
     62 @@ -152,9 +172,14 @@ static void _nonlinear_layer(
     63          TMP_Y[j] = Y[j] ^ RTK_Y[j];
     64      }
     65  
     66 +    debug_dump_buffer("Xj XOR RTK_Xj", sizeof(TMP_X), TMP_X, 12);
     67 +    debug_dump_buffer("Yj XOR RTK_Yj", sizeof(TMP_Y), TMP_Y, 12);
     68 +
     69      // Threshold Implementation of the 8-bit S-box
     70      for (size_t j=0; j<ROUND_TWEAKEY_BYTES; j++)
     71      {
     72 +        fprintf(DUMP, "        S-box (%zu/%zu)\n", j+1, (size_t)ROUND_TWEAKEY_BYTES);
     73 +
     74          // Decomposition into nibbles
     75          x_hi = TMP_X[j] >> 4;
     76          x_lo = TMP_X[j] & 0xf;
     77 @@ -162,20 +187,54 @@ static void _nonlinear_layer(
     78          y_lo = TMP_Y[j] & 0xf;
     79          z_hi = Z[j] >> 4;
     80          z_lo = Z[j] & 0xf;
     81 +
     82 +        fprintf(DUMP, "            x_hi: %u\n", x_hi);
     83 +        fprintf(DUMP, "            x_lo: %u\n", x_lo);
     84 +        fprintf(DUMP, "            y_hi: %u\n", y_hi);
     85 +        fprintf(DUMP, "            y_lo: %u\n", y_lo);
     86 +        fprintf(DUMP, "            z_hi: %u\n", z_hi);
     87 +        fprintf(DUMP, "            z_lo: %u\n", z_lo);
     88 +
     89          // First 4-bit S-box
     90 +        fprintf(DUMP, "            First 4-bit S-box\n");
     91 +
     92          tmp0 = G[(y_lo&7)>>1][z_lo];
     93          tmp1 = G[(z_lo&7)>>1][x_lo];
     94          tmp2 = G[(x_lo&7)>>1][y_lo];
     95          x_hi ^= F[tmp1][tmp2];
     96          y_hi ^= F[tmp2][tmp0];
     97          z_hi ^= F[tmp0][tmp1];
     98 +
     99 +        fprintf(DUMP, "            tmp0: %u\n", tmp0);
    100 +        fprintf(DUMP, "            tmp1: %u\n", tmp1);
    101 +        fprintf(DUMP, "            tmp2: %u\n", tmp2);
    102 +        fprintf(DUMP, "            x_hi: %u\n", x_hi);
    103 +        fprintf(DUMP, "            y_hi: %u\n", y_hi);
    104 +        fprintf(DUMP, "            z_hi: %u\n", z_hi);
    105 +
    106          // Second 4-bit S-box
    107 +        fprintf(DUMP, "            Second 4-bit S-box\n");
    108 +
    109          tmp0 = P[Q[y_hi&3 ^ (y_hi&8)>>1][z_hi]];
    110          tmp1 = P[Q[z_hi&3 ^ (z_hi&8)>>1][x_hi]];
    111          tmp2 = P[Q[x_hi&3 ^ (x_hi&8)>>1][y_hi]];
    112          x_lo ^= Q[tmp1&3 ^ (tmp1&8)>>1][tmp2];
    113          y_lo ^= Q[tmp2&3 ^ (tmp2&8)>>1][tmp0];
    114          z_lo ^= Q[tmp0&3 ^ (tmp0&8)>>1][tmp1];
    115 +
    116 +        fprintf(DUMP, "            y_hi&3 ^ (y_hi&8)>>1: %u\n", y_hi&3 ^ (y_hi&8)>>1);
    117 +        fprintf(DUMP, "            z_hi&3 ^ (z_hi&8)>>1: %u\n", z_hi&3 ^ (z_hi&8)>>1);
    118 +        fprintf(DUMP, "            x_hi&3 ^ (x_hi&8)>>1: %u\n", x_hi&3 ^ (x_hi&8)>>1);
    119 +        fprintf(DUMP, "            Q[y_hi&3 ^ (y_hi&8)>>1][z_hi]: %u\n", Q[y_hi&3 ^ (y_hi&8)>>1][z_hi]);
    120 +        fprintf(DUMP, "            Q[z_hi&3 ^ (z_hi&8)>>1][x_hi]: %u\n", Q[z_hi&3 ^ (z_hi&8)>>1][x_hi]);
    121 +        fprintf(DUMP, "            Q[x_hi&3 ^ (x_hi&8)>>1][y_hi]: %u\n", Q[x_hi&3 ^ (x_hi&8)>>1][y_hi]);
    122 +        fprintf(DUMP, "            tmp0: %u\n", tmp0);
    123 +        fprintf(DUMP, "            tmp1: %u\n", tmp1);
    124 +        fprintf(DUMP, "            tmp2: %u\n", tmp2);
    125 +        fprintf(DUMP, "            x_lo: %u\n", x_lo);
    126 +        fprintf(DUMP, "            y_lo: %u\n", y_lo);
    127 +        fprintf(DUMP, "            z_lo: %u\n", z_lo);
    128 +
    129          // Third 4-bit S-box
    130          tmp0 = G[(y_lo&7)>>1][z_lo] ^ 1;
    131          tmp1 = G[(z_lo&7)>>1][x_lo];
    132 @@ -183,12 +242,28 @@ static void _nonlinear_layer(
    133          x_hi ^= F[tmp1][tmp2];
    134          y_hi ^= F[tmp2][tmp0];
    135          z_hi ^= F[tmp0][tmp1];
    136 +
    137 +        fprintf(DUMP, "            tmp0: %u\n", tmp0);
    138 +        fprintf(DUMP, "            tmp1: %u\n", tmp1);
    139 +        fprintf(DUMP, "            tmp2: %u\n", tmp2);
    140 +        fprintf(DUMP, "            x_hi: %u\n", x_hi);
    141 +        fprintf(DUMP, "            y_hi: %u\n", y_hi);
    142 +        fprintf(DUMP, "            z_hi: %u\n", z_hi);
    143 +
    144          // Build bytes from nibbles
    145          TMP_X[j] = (x_hi << 4 | x_lo);
    146          TMP_Y[j] = (y_hi << 4 | y_lo);
    147          TMP_Z[j] = (z_hi << 4 | z_lo);
    148 +
    149 +        debug_dump_buffer("TMP_X", sizeof(TMP_X), TMP_X, 12);
    150 +        debug_dump_buffer("TMP_Y", sizeof(TMP_Y), TMP_Y, 12);
    151 +        debug_dump_buffer("TMP_Z", sizeof(TMP_Z), TMP_Z, 12);
    152      }
    153  
    154 +    debug_dump_buffer("TMP_X (post-S-box)", sizeof(TMP_X), TMP_X, 12);
    155 +    debug_dump_buffer("TMP_Y (post-S-box)", sizeof(TMP_Y), TMP_Y, 12);
    156 +    debug_dump_buffer("TMP_Z (post-S-box)", sizeof(TMP_Z), TMP_Z, 12);
    157 +
    158      for (size_t j=0; j<8; j++)
    159      {
    160          size_t dest_j = 15-j;
    161 @@ -196,10 +271,16 @@ static void _nonlinear_layer(
    162          Y[dest_j] ^= TMP_Y[j];
    163          Z[dest_j] ^= TMP_Z[j];
    164      }
    165 +
    166 +    debug_dump_buffer("X (post-XOR)", BLOCK_BYTES, X, 12);
    167 +    debug_dump_buffer("Y (post-XOR)", BLOCK_BYTES, Y, 12);
    168 +    debug_dump_buffer("Z (post-XOR)", BLOCK_BYTES, Z, 12);
    169  }
    170  
    171  static void _linear_layer(uint8_t X[BLOCK_BYTES])
    172  {
    173 +    fprintf(DUMP, "        linear layer\n");
    174 +
    175      X[15] ^= X[1];
    176      X[15] ^= X[2];
    177      X[15] ^= X[3];
    178 @@ -214,6 +295,8 @@ static void _linear_layer(uint8_t X[BLOCK_BYTES])
    179      X[11] ^= X[7];
    180      X[10] ^= X[7];
    181      X[9]  ^= X[7];
    182 +
    183 +    debug_dump_buffer("X", BLOCK_BYTES, X, 12);
    184  }
    185  
    186  static void _permutation_layer(uint8_t X[BLOCK_BYTES], permutation p)
    187 @@ -223,6 +306,8 @@ static void _permutation_layer(uint8_t X[BLOCK_BYTES], permutation p)
    188          return;
    189      }
    190  
    191 +    fprintf(DUMP, "        permutation layer\n");
    192 +
    193      uint8_t X_old[BLOCK_BYTES];
    194      memcpy(X_old, X, BLOCK_BYTES);
    195  
    196 @@ -232,6 +317,8 @@ static void _permutation_layer(uint8_t X[BLOCK_BYTES], permutation p)
    197      {
    198          X[pi[j]] = X_old[j];
    199      }
    200 +
    201 +    debug_dump_buffer("X", BLOCK_BYTES, X, 12);
    202  }
    203  
    204  static void _one_round_egfn(
    205 @@ -270,11 +357,15 @@ void lilliput_tbc_encrypt(
    206      _compute_round_tweakeys(key, tweak, RTK_X, RTK_Y);
    207  
    208  
    209 +    fprintf(DUMP, "running EGFN %zu times\n", (size_t)ROUNDS);
    210 +
    211      for (size_t i=0; i<ROUNDS-1; i++)
    212      {
    213 +        fprintf(DUMP, "    round %zu\n", (size_t)i);
    214          _one_round_egfn(X, Y, Z, RTK_X[i], RTK_Y[i], PERMUTATION_ENCRYPTION);
    215      }
    216  
    217 +    fprintf(DUMP, "    round %zu\n", (size_t)(ROUNDS-1));
    218      _one_round_egfn(X, Y, Z, RTK_X[ROUNDS-1], RTK_Y[ROUNDS-1], PERMUTATION_NONE);
    219  
    220  
    221 diff --git a/src/add_threshold/random.c b/src/add_threshold/random.c
    222 index a966a8e..8d5f2cc 100644
    223 --- a/src/add_threshold/random.c
    224 +++ b/src/add_threshold/random.c
    225 @@ -21,6 +21,8 @@ This file provides a system-specific function to generate random bytes.
    226  
    227  #define _GNU_SOURCE
    228  
    229 +#include "debug.h"
    230 +
    231  #include <stddef.h>
    232  #include <stdint.h>
    233  
    234 @@ -32,5 +34,6 @@ This file provides a system-specific function to generate random bytes.
    235  
    236  void randombytes(size_t nb, uint8_t out[nb])
    237  {
    238 -    syscall(SYS_getrandom, out, nb, 0);
    239 +    for (size_t i=0; i<nb; i++)
    240 +        out[i] = i;
    241  }
    242 diff --git a/src/add_threshold/tweakey.c b/src/add_threshold/tweakey.c
    243 index 7822564..e1abbb6 100644
    244 --- a/src/add_threshold/tweakey.c
    245 +++ b/src/add_threshold/tweakey.c
    246 @@ -20,6 +20,8 @@ This file provides a first-order threshold implementation of Lilliput-TBC's
    247  tweakey schedule, where the tweak and the key are split into two shares.
    248  */
    249  
    250 +#include "debug.h"
    251 +
    252  #include <stdint.h>
    253  #include <string.h>
    254  
    255 @@ -43,6 +45,7 @@ void tweakey_state_init(
    256  {
    257      uint8_t SHARES_0[KEY_BYTES];
    258      randombytes(sizeof(SHARES_0), SHARES_0);
    259 +    debug_dump_buffer("SHARES_0", sizeof(SHARES_0), SHARES_0, 8);
    260  
    261      memcpy(TK_Y, SHARES_0, KEY_BYTES);
    262      memcpy(TK_X, tweak, TWEAK_BYTES);
    263 @@ -68,20 +71,32 @@ void tweakey_state_extract(
    264      {
    265          const uint8_t *TKj_X = TK_X + j*LANE_BYTES;
    266  
    267 +        fprintf(DUMP, "        XORing lane %zu/%zu (RTK_X)\n", 1+j, (size_t)LANES_NB);
    268 +        debug_dump_buffer("RTK_X", ROUND_TWEAKEY_BYTES, round_tweakey_X, 12);
    269 +        debug_dump_buffer("lane[j]", LANE_BYTES, TKj_X, 12);
    270 +
    271          for (size_t k=0; k<LANE_BYTES; k++)
    272          {
    273              round_tweakey_X[k] ^= TKj_X[k];
    274          }
    275 +
    276 +        debug_dump_buffer("=> RTK_X", ROUND_TWEAKEY_BYTES, round_tweakey_X, 12);
    277      }
    278  
    279      for (size_t j=0; j<KEY_LANES_NB; j++)
    280      {
    281          const uint8_t *TKj_Y = TK_Y + j*LANE_BYTES;
    282  
    283 +        fprintf(DUMP, "        XORing lane %zu/%zu (RTK_Y)\n", 1+j, (size_t)LANES_NB);
    284 +        debug_dump_buffer("RTK_Y", ROUND_TWEAKEY_BYTES, round_tweakey_Y, 12);
    285 +        debug_dump_buffer("lane[j]", LANE_BYTES, TKj_Y, 12);
    286 +
    287          for (size_t k=0; k<LANE_BYTES; k++)
    288          {
    289              round_tweakey_Y[k] ^= TKj_Y[k];
    290          }
    291 +
    292 +        debug_dump_buffer("=> RTK_Y", ROUND_TWEAKEY_BYTES, round_tweakey_Y, 12);
    293      }
    294  
    295      round_tweakey_X[0] ^= round_constant;
    296 @@ -100,6 +115,10 @@ static const matrix_multiplication ALPHAS[7] = {
    297      _multiply_MR3
    298  };
    299  
    300 +static char const * const ALPHAS_STR[7] = {
    301 +    "M", "M²", "M³", "M⁴", "MR", "MR²", "MR³"
    302 +};
    303 +
    304  
    305  void tweakey_state_update(uint8_t TK_X[TWEAKEY_BYTES], uint8_t TK_Y[KEY_BYTES])
    306  {
    307 @@ -111,6 +130,10 @@ void tweakey_state_update(uint8_t TK_X[TWEAKEY_BYTES], uint8_t TK_Y[KEY_BYTES])
    308          memcpy(TKj_old_X, TKj_X, LANE_BYTES);
    309  
    310          ALPHAS[j](TKj_old_X, TKj_X);
    311 +
    312 +        fprintf(DUMP, "        multiplying lane %zu/%zu by %s\n", 1+j, (size_t)LANES_NB, ALPHAS_STR[j]);
    313 +        debug_dump_buffer("TK_j_X^i-1", LANE_BYTES, TKj_old_X, 12);
    314 +        debug_dump_buffer("TK_j_X^i", LANE_BYTES, TKj_X, 12);
    315      }
    316  
    317      for (size_t j=0; j<KEY_LANES_NB; j++)
    318 @@ -125,5 +148,11 @@ void tweakey_state_update(uint8_t TK_X[TWEAKEY_BYTES], uint8_t TK_Y[KEY_BYTES])
    319  
    320          ALPHAS[j + TWEAK_LANES_NB](TKj_X_old, TKj_X);
    321          ALPHAS[j + TWEAK_LANES_NB](TKj_Y_old, TKj_Y);
    322 +
    323 +        fprintf(DUMP, "        multiplying lane %zu/%zu by %s\n", 1+j + TWEAK_LANES_NB, (size_t)LANES_NB, ALPHAS_STR[j + TWEAK_LANES_NB]);
    324 +        debug_dump_buffer("TK_j_X^i-1", LANE_BYTES, TKj_X_old, 12);
    325 +        debug_dump_buffer("TK_j_X^i", LANE_BYTES, TKj_X, 12);
    326 +        debug_dump_buffer("TK_j_Y^i-1", LANE_BYTES, TKj_Y_old, 12);
    327 +        debug_dump_buffer("TK_j_Y^i", LANE_BYTES, TKj_Y, 12);
    328      }
    329  }