6 files changed, 131 insertions, 94 deletions
diff --git a/src/ref/cipher.c b/src/ref/cipher.c
index 8ebbbc3..07405e1 100644
--- a/src/ref/cipher.c
+++ b/src/ref/cipher.c
@@ -148,7 +148,7 @@ void lilliput_tbc_encrypt(
     uint8_t RTK[ROUNDS][ROUND_TWEAKEY_BYTES];
     _compute_round_tweakeys(key, tweak, RTK);
 
-    for (uint8_t i=0; i<ROUNDS-1; i++)
+    for (size_t i=0; i<ROUNDS-1; i++)
     {
         _one_round_egfn(X, RTK[i], PERMUTATION_ENCRYPTION);
     }
@@ -171,7 +171,7 @@ void lilliput_tbc_decrypt(
     uint8_t RTK[ROUNDS][ROUND_TWEAKEY_BYTES];
     _compute_round_tweakeys(key, tweak, RTK);
 
-    for (uint8_t i=0; i<ROUNDS-1; i++)
+    for (size_t i=0; i<ROUNDS-1; i++)
     {
         _one_round_egfn(X, RTK[ROUNDS-1-i], PERMUTATION_DECRYPTION);
     }
diff --git a/src/ref/lilliput-ae-utils.h b/src/ref/lilliput-ae-utils.h
index 0efb776..19b4623 100644
--- a/src/ref/lilliput-ae-utils.h
+++ b/src/ref/lilliput-ae-utils.h
@@ -28,16 +28,6 @@ This file provides functions used by both authenticated encryption modes.
 #include "constants.h"
 
 
-static inline uint8_t upper_nibble(uint8_t i)
-{
-    return i >> 4;
-}
-
-static inline uint8_t lower_nibble(uint8_t i)
-{
-    return i & 0x0f;
-}
-
 static inline void encrypt(const uint8_t K[KEY_BYTES],
                            const uint8_t T[TWEAK_BYTES],
                            const uint8_t M[BLOCK_BYTES],
@@ -68,35 +58,41 @@ static inline void xor_arrays(size_t len, uint8_t out[len], const uint8_t a[len]
 
 static inline void pad10(size_t X_len, const uint8_t X[X_len], uint8_t padded[BLOCK_BYTES])
 {
-    /* pad10*(X) = X || 1 || 0^{n-|X|-1} */
-
-    /* For example, with uint8_t X[3] = { [0]=0x01, [1]=0x02, [2]=0x03 }
+    /* Assuming 0 < |X| < n (in bytes: X_len < BLOCK_BYTES, otherwise padded[X_len] below is out of bounds):
+     *
+     * pad10*(X) = X || 1 || 0^{n-|X|-1}
+     *
+     * For example, with uint8_t X[3] = { [0]=0x01, [1]=0x02, [2]=0x03 }
      *
      * pad10*(X) =
-     *       X[2]     X[1]     X[0]   1 0*
-     *     00000011 00000010 00000001 1 0000000 00000000...
+     *       X[0]     X[1]     X[2]   1 0*
+     *     00000001 00000010 00000011 1 0000000 00000000...
      *
-     * - padded[0, 11]:  zeroes
-     * - padded[12]:     10000000
-     * - padded[13, 15]: X[0, 2]
+     * - padded[0, 2]:  X[0, 2]
+     * - padded[3]:     10000000
+     * - padded[4, 15]: zeroes
      */
 
-    /* Assume that X_len<BLOCK_BYTES. */
+    memcpy(padded, X, X_len); /* X goes first, in its original byte order: padded[0 .. X_len-1] */
+    padded[X_len] = 0x80; /* the single '1' bit, followed by seven '0' bits */
 
-    size_t pad_len = BLOCK_BYTES-X_len;
+    /* memset(&padded[BLOCK_BYTES], 0, 0) may or may not constitute
+     * undefined behaviour; use a straight loop instead. */
 
-    memset(padded, 0, pad_len-1);
-    padded[pad_len-1] = 0x80;
-    memcpy(padded+pad_len, X, X_len);
+    for (size_t i=X_len+1; i<BLOCK_BYTES; i++) /* zero whatever is left; runs 0 times when X_len == BLOCK_BYTES-1 */
+    {
+        padded[i] = 0;
+    }
 }
 
 static inline void copy_block_index(size_t index, uint8_t tweak[TWEAK_BYTES])
 {
-    /* NB: little-endian architectures can simply use:
-     *     memcpy(tweak, &index, sizeof(index)); */
-    for (size_t i=0; i<sizeof(index); i++)
+    size_t s = sizeof(index); /* width of the block index, in bytes */
+    uint8_t *dest = &tweak[TWEAK_BYTES-s]; /* the index occupies the last s bytes of the tweak; earlier bytes are not touched */
+
+    for (size_t i=0; i<s; i++) /* one index byte per iteration, most significant byte first */
     {
-        tweak[i] = index >> 8*i & 0xff;
+        dest[i] = index >> 8*(s-1-i); /* the conversion to uint8_t keeps only the low 8 bits of the shifted value */
     }
 }
 
@@ -106,19 +102,22 @@ static inline void fill_index_tweak(
     uint8_t tweak[TWEAK_BYTES]
 )
 {
-    /* With an s-bit block index, the t-bit tweak is filled as follows:
+    /* The t-bit tweak is filled as follows:
      *
-     * - bits [  1, t-4]: block index
-     *        [  1,   s]: actual block index
-     *        [s+1, t-4]: 0-padding
-     * - bits [t-3,   t]: 4-bit prefix
+     *   1    4    5         t
+     * [ prefix || block index ]
+     *
+     * The s-bit block index is encoded as follows:
+     *
+     *   5        t-s    t-s+1                t
+     * [ zero padding || block index, MSB first ]
      */
 
-    copy_block_index(block_index, tweak);
+    tweak[0] = prefix<<4;
 
     /* Assume padding bytes have already been set to 0. */
 
-    tweak[TWEAK_BYTES-1] |= prefix << 4;
+    copy_block_index(block_index, tweak);
 }
 
 static void process_associated_data(
diff --git a/src/ref/lilliput-i.c b/src/ref/lilliput-i.c
index 6f869c3..3358b10 100644
--- a/src/ref/lilliput-i.c
+++ b/src/ref/lilliput-i.c
@@ -32,58 +32,71 @@ static const uint8_t _0n[BLOCK_BYTES] = {
 };
 
 
+static uint8_t _upper_nibble(uint8_t i) /* returns the 4 most significant bits of i, as a value in [0, 15] */
+{
+    return i >> 4;
+}
+
+static uint8_t _lower_nibble(uint8_t i) /* returns the 4 least significant bits of i, as a value in [0, 15] */
+{
+    return i & 0x0f;
+}
+
 static void _init_msg_tweak(const uint8_t N[NONCE_BYTES], uint8_t tweak[TWEAK_BYTES])
 {
-    /* With an s-bit block index, the t-bit tweak is filled as follows:
+    /* The t-bit tweak is filled as follows:
+     *
+     *   1    4    5     |N|+4    |N|+5     t
+     * [ prefix ||  nonce      || block index ]
      *
-     * - bits [      1, t-|N|-4]: block index
-     *        [      1,       s]: actual block index
-     *        [    s+1, t-|N|-4]: 0-padding
-     * - bits [t-|N|-3,     t-4]: nonce
-     * - bits [    t-3,       t]: 4-bit prefix
+     * The s-bit block index is encoded as follows:
      *
-     * This function sets bits s+1 to t-4 once and for all.
+     *   |N|+5    t-s    t-s+1                t
+     * [ zero padding || block index, MSB first ]
+     *
+     * This function sets bits 5 to t-s once and for all.
      */
 
-    size_t N_start = TWEAK_BYTES - NONCE_BYTES - 1;
-
-    for (size_t i=sizeof(size_t); i<N_start; i++)
-    {
-        tweak[i] = 0;
-    }
-
-    tweak[N_start] = lower_nibble(N[0]) << 4;
+    tweak[0] = _upper_nibble(N[0]); /* high nibble (the prefix slot) is left at 0; _fill_msg_tweak writes the prefix there */
 
     for (size_t i=1; i<NONCE_BYTES; i++)
     {
-        tweak[N_start+i] = lower_nibble(N[i]) << 4 ^ upper_nibble(N[i-1]);
+        tweak[i] = _lower_nibble(N[i-1]) << 4 ^ _upper_nibble(N[i]); /* the nonce sits 4 bits to the right, so each tweak byte straddles two nonce bytes */
     }
 
-    tweak[TWEAK_BYTES-1] = upper_nibble(N[NONCE_BYTES-1]);
+    tweak[NONCE_BYTES] = _lower_nibble(N[NONCE_BYTES-1]) << 4; /* last nonce nibble; the low 4 bits of this byte are already zero padding */
+
+    /* The number of bits we need to zero out is:
+     *     t - |N| - s - 4        - 4
+     *                   (prefix)   (zeroed out by previous assignment)
+     */
+    memset(&tweak[NONCE_BYTES+1], 0, TWEAK_BYTES-NONCE_BYTES-sizeof(size_t)-1); /* length is in bytes and may be 0; the last sizeof(size_t) bytes are left for the block index */
 }
 
 static void _fill_msg_tweak(
-    uint8_t       prefix,
-    size_t        block_index,
-    uint8_t       tweak[TWEAK_BYTES]
+    uint8_t prefix, /* only its low 4 bits end up in the tweak (see the shift below) */
+    size_t  block_index, /* written to the last sizeof(size_t) bytes by copy_block_index */
+    uint8_t tweak[TWEAK_BYTES] /* updated in place; bytes in between are kept as they are */
 )
 {
-    /* With an s-bit block index, the t-bit tweak is filled as follows:
+    /* The t-bit tweak is filled as follows:
      *
-     * - bits [      1, t-|N|-4]: block index
-     *        [      1,       s]: actual block index
-     *        [    s+1, t-|N|-4]: 0-padding
-     * - bits [t-|N|-3,     t-4]: nonce
-     * - bits [    t-3,       t]: 4-bit prefix
+     *   1    4    5     |N|+4    |N|+5     t
+     * [ prefix ||  nonce      || block index ]
      *
-     * This function assumes bits s+1 to t-3 have already been set,
-     * and only sets bits 1 to s and t-3 to t.
+     * The s-bit block index is encoded as follows:
+     *
+     *   |N|+5    t-s    t-s+1                t
+     * [ zero padding || block index, MSB first ]
+     *
+     * This function assumes bits 5 to t-s have already been set, and
+     * only sets bits 1 to 4 and t-s+1 to t.
      */
 
-    copy_block_index(block_index, tweak);
+    uint8_t *msb = &tweak[0]; /* first tweak byte: prefix in the high nibble, first nonce nibble in the low one */
+    *msb = prefix<<4 ^ _lower_nibble(*msb); /* replaces the high nibble, keeps the low nibble; bits of prefix above bit 3 are lost in the uint8_t conversion */
 
-    uint8_t *msb = &tweak[TWEAK_BYTES-1];
-    *msb = prefix<<4 ^ lower_nibble(*msb);
+    copy_block_index(block_index, tweak); /* overwrites the last sizeof(size_t) bytes with the index, MSB first */
 }
 
 static void _encrypt_message(
diff --git a/src/ref/lilliput-ii.c b/src/ref/lilliput-ii.c
index 6811d49..bb43d08 100644
--- a/src/ref/lilliput-ii.c
+++ b/src/ref/lilliput-ii.c
@@ -28,36 +28,42 @@ This file implements Lilliput-AE's nonce-misuse-resistant mode based on SCT-2.
 
 static void _init_msg_tweak(const uint8_t tag[TAG_BYTES], uint8_t tweak[TWEAK_BYTES])
 {
-    /* With an s-bit block index, the t-bit tweak is filled as follows:
+    /* The t-bit tweak is filled as follows:
+     *
+     *   1    2                      t
+     * [ 1 || tag[2,t] XOR block index  ]
+     *
+     * The s-bit block index is XORed to the tag as follows:
      *
-     * - bits [  1, t-1]: tag + block index
-     *        [  1,   s]: tag[1..s] XOR block index
-     *        [s+1, t-1]: tag[s+1..t-1]
-     * - bit t: 1
+     *   2       t-s    t-s+1                                  t
+     * [ tag[2, t-s] || tag[t-s+1, t] XOR block index, MSB first ]
      *
-     * This function sets bits s+1 to t once and for all.
+     * This function sets bits 1 to t-s once and for all.
      */
 
-    memcpy(tweak+sizeof(size_t), tag+sizeof(size_t), TAG_BYTES-sizeof(size_t));
-    tweak[TWEAK_BYTES-1] |= 0x80;
+    memcpy(tweak, tag, TAG_BYTES-sizeof(size_t)); /* leading tag bytes only; the last sizeof(size_t) bytes of the tweak are not written here */
+    tweak[0] |= 0x80; /* forces bit 1 (the most significant bit of the first byte) to 1, whatever the tag holds there */
 }
 
 static void _fill_msg_tweak(const uint8_t tag[TAG_BYTES], size_t block_index, uint8_t tweak[TWEAK_BYTES])
 {
-    /* With an s-bit block index, the t-bit tweak is filled as follows:
+    /* The t-bit tweak is filled as follows:
+     *
+     *   1    2                      t
+     * [ 1 || tag[2,t] XOR block index  ]
+     *
+     * The s-bit block index is XORed to the tag as follows:
      *
-     * - bits [  1, t-1]: tag + block index
-     *        [  1,   s]: tag[1..s] XOR block index
-     *        [s+1, t-1]: tag[s+1..t-1]
-     * - bit t: 1
+     *   2       t-s    t-s+1                                  t
+     * [ tag[2, t-s] || tag[t-s+1, t] XOR block index, MSB first ]
      *
-     * This function assumes bits s+1 to t have already been set, and
-     * only sets bits 1 to s.
+     * This function assumes bits 1 to t-s have already been set, and
+     * only sets bits t-s+1 to t.
      */
 
     copy_block_index(block_index, tweak);
 
-    for (size_t i=0; i<sizeof(block_index); i++)
+    for (size_t i=TWEAK_BYTES-sizeof(size_t); i<TWEAK_BYTES; i++)
     {
         tweak[i] ^= tag[i];
     }
@@ -67,12 +73,12 @@ static void _fill_tag_tweak(const uint8_t N[NONCE_BYTES], uint8_t tweak[TWEAK_BY
 {
     /* The t-bit tweak is filled as follows:
      *
-     * - bits [  1, t-7]: N
-     * - bits [t-7,   t]: 0001||0^4
+     *   1  4    5   8    t-|N|+1     t
+     * [ 0001 ||  0^4  ||        nonce  ]
      */
 
-    memcpy(tweak, N, TWEAK_BYTES-1);
-    tweak[TWEAK_BYTES-1] = 0x10;
+    tweak[0] = 0x10;
+    memcpy(&tweak[1], N, TWEAK_BYTES-1);
 }
 
 static void _generate_tag(
@@ -129,8 +135,8 @@ static void _encrypt_message(
     _init_msg_tweak(tag, tweak);
 
     uint8_t padded_N[BLOCK_BYTES];
-    memcpy(padded_N, N, NONCE_BYTES);
-    padded_N[BLOCK_BYTES-1] = 0;
+    padded_N[0] = 0;
+    memcpy(&padded_N[1], N, NONCE_BYTES);
 
     size_t l = M_len / BLOCK_BYTES;
     size_t rest = M_len % BLOCK_BYTES;
diff --git a/src/ref/multiplications.h b/src/ref/multiplications.h
index 4de1848..c0645b9 100644
--- a/src/ref/multiplications.h
+++ b/src/ref/multiplications.h
@@ -71,6 +71,26 @@ static void _multiply_M3(const uint8_t x[LANE_BYTES], uint8_t y[LANE_BYTES])
     y[0] = x[5];
 }
 
+static void _multiply_M4(const uint8_t x[LANE_BYTES], uint8_t y[LANE_BYTES]) /* presumably y = M^4 * x, going by the M, M2, M3 naming - TODO confirm against _multiply_M */
+{
+    uint8_t a_5  = x[5]<<3  ^ x[4]; /* NOTE(review): a_*, b_*, c_* look like bytes 5 and 4 after one, two and three applications of M - confirm */
+    uint8_t a_4  = x[4]>>3  ^ x[3];
+    uint8_t b_5 = a_5<<3 ^ a_4; /* shifts are computed on promoted ints; storing into uint8_t drops any bits above bit 7 */
+    uint8_t b_4 = a_4>>3 ^ x[2];
+
+    uint8_t c_4 = b_4>>3 ^ x[6]<<2 ^ x[1];
+    uint8_t c_5 = b_5<<3 ^ b_4;
+
+    y[7] = b_5; /* x and y must not overlap: y is written from here on while x[5], x[0], x[7] and x[6] are still read below */
+    y[6] = c_5;
+    y[5] = c_5<<3 ^ c_4;
+    y[4] = c_4>>3 ^ x[5]<<2 ^ x[0];
+    y[3] = a_5<<2 ^ x[7];
+    y[2] = b_5<<2 ^ x[6];
+    y[1] = x[5];
+    y[0] = a_5; /* indices 0..7 are accessed, so LANE_BYTES is assumed to be at least 8 */
+}
+
 static void _multiply_MR(const uint8_t x[LANE_BYTES], uint8_t y[LANE_BYTES])
 {
     y[0] = x[1];
diff --git a/src/ref/tweakey.c b/src/ref/tweakey.c
index 2f357ca..510f35a 100644
--- a/src/ref/tweakey.c
+++ b/src/ref/tweakey.c
@@ -63,10 +63,11 @@ void tweakey_state_extract(
 
 typedef void (*matrix_multiplication)(const uint8_t x[LANE_BYTES], uint8_t y[LANE_BYTES]);
 
-static const matrix_multiplication ALPHAS[6] = {
+static const matrix_multiplication ALPHAS[7] = {
     _multiply_M,
     _multiply_M2,
     _multiply_M3,
+    _multiply_M4,
     _multiply_MR,
     _multiply_MR2,
     _multiply_MR3
@@ -75,15 +76,13 @@ static const matrix_multiplication ALPHAS[6] = {
 
 void tweakey_state_update(uint8_t TK[TWEAKEY_BYTES])
 {
-    /* Skip lane 0, as it is multiplied by the identity matrix. */
-
-    for (size_t j=1; j<LANES_NB; j++)
+    for (size_t j=0; j<LANES_NB; j++) /* every lane is updated in place, lane 0 included */
     {
         uint8_t *TKj = TK + j*LANE_BYTES;
 
         uint8_t TKj_old[LANE_BYTES];
         memcpy(TKj_old, TKj, LANE_BYTES);
 
-        ALPHAS[j-1](TKj_old, TKj);
+        ALPHAS[j](TKj_old, TKj); /* lane j gets ALPHAS[j], reading from the saved copy; ALPHAS has 7 entries, so LANES_NB must not exceed 7 */
     }
 }