/* C supports multi-character character literals, which are especially
   useful for switch-case labels, but they have implementation-specific
   semantics, which makes them nonportable for direct use in I/O, which
   is where you’d most want to use them.  [C99
   §6.4.4.4p10](https://stackoverflow.com/questions/3960954/why-do-multicharacter-literals-exist-in-c-and-c)
   says:

   > The value of an integer character constant containing more than
   > one character (e.g., 'ab'), or containing a character or escape
   > sequence that does not map to a single-byte execution character,
   > is implementation-defined.

   The StackOverflow answer also suggests using them for enums to make
   memory dumps easier to read.

   In GCC on this amd64 they are 32-bit integers in, surprisingly,
   big-endian format.  Since the machine is little-endian, this
   probably means that big-endian multi-character character literals
   *are* portable across GCC platforms.

   But we can do better!  It turns out the preprocessor can do bitwise
   arithmetic on multi-character character literals, so if the compiler
   implements multi-character characters in one of a known set of
   reasonable ways, we can identify it with the preprocessor and give
   the multi-character character literal some kind of well-defined
   behavior.  In this case I’m choosing little-endian behavior just so
   that on my machine the case I test is the hairier one.  It turns out
   to be more efficient on little-endian hardware, too, because no
   byte-swapping is needed.
*/

#include <stdio.h>
#include <string.h>

/* little_endian_char(c): given a four-character literal such as 'abcd',
   yield the integer whose least significant byte is the first character
   ('a') and whose most significant byte is the last ('d'), whichever of
   the two recognized layouts the compiler uses for the literal itself. */
#if ('abcd' & 0xff) == 'd'  /* beware of C’s notorious precedence bug here */
/* First character is most significant (GCC): swap the four bytes. */
#define little_endian_char(c) (((c) >> 24)             \
                               | ((c) >> 8 & 0xff00)   \
                               | (((c) & 0xff00) << 8) \
                               | (((c) & 0xff) << 24))
#elif ('abcd' & 0xff) == 'a'
/* First character is already least significant: nothing to do. */
#define little_endian_char(c) (c)
#else
/* You could add more cases here if your compiler does something reasonable */
#error "Can’t figure out the multi-character character literal semantics"
#endif

/* Deliberately declared but never defined: a reference to it survives
   only when an mclit() argument is too long, which then shows up as a
   link-time error (see the discussion at the end of main). */
extern long multicharacter_literal_too_long(char *s);

int main(int argc, char **argv)
{
  printf("abcde = %010x\n", 'abcde'); /* gives two warnings; disable one with -Wno-multichar in CFLAGS */
  printf("abcd little-endian = %010x\n", little_endian_char('abcd'));

  /* Example of using it for switch-case labels: */
  if (argc == 2 && strlen(argv[1]) == 4) {
    unsigned char *p = (unsigned char*)argv[1];
    unsigned c = p[0] | p[1] << 8 | p[2] << 16 | (unsigned)p[3] << 24; /* little-endian; the cast keeps a byte >= 0x80 from overflowing int */
    switch(c) {
    case little_endian_char('za3k'): printf("You typed za3k\n"); break;
    case little_endian_char('PICT'): printf("It’s a PICT\n"); break;
    default: printf("You said 0x%08x!\n", c); break;
    }
  }

  /* With GCC 12.2.0 with -Os, the above conditional and switch code
     compiles as follows:

    109e:       83 fd 02                cmp    $0x2,%ebp         # argc == 2?
    10a1:       75 49                   jne    10ec              # Skip past switch if not.
    10a3:       48 8b 5b 08             mov    0x8(%rbx),%rbx    # Load argv[1] into callee-saved register and
    10a7:       48 89 df                mov    %rbx,%rdi         # pass as sole argument
    10aa:       e8 91 ff ff ff          call   1040              # to strlen;
    10af:       48 83 f8 04             cmp    $0x4,%rax         # if return value not 4,
    10b3:       75 37                   jne    10ec              # skip past switch.
    10b5:       8b 33                   mov    (%rbx),%esi       # This is the whole little-endian decoding line!
    10b7:       81 fe 50 49 43 54       cmp    $0x54434950,%esi  # Compare to 'PICT'; if there’s a match,
    10bd:       74 11                   je     10d0              # jump to PICT-handling code.
    10bf:       48 8d 3d 69 0f 00 00    lea    0xf69(%rip),%rdi  # Load "You typed za3k" speculatively, then
    10c6:       81 fe 7a 61 33 6b       cmp    $0x6b33617a,%esi  # compare to 'za3k', and on match,
    10cc:       74 09                   je     10d7              # skip to puts call.
    10ce:       eb 0e                   jmp    10de              # If no match, invoke the printf case.
    10d0:       48 8d 3d 67 0f 00 00    lea    0xf67(%rip),%rdi  # Load "It’s a PICT", and then
    10d7:       e8 54 ff ff ff          call   1030              # call puts() (note incoming branch here).
    10dc:       eb 0e                   jmp    10ec              # Then leave the switch.
    10de:       48 8d 3d 67 0f 00 00    lea    0xf67(%rip),%rdi  # Load printf format string,
    10e5:       31 c0                   xor    %eax,%eax         # set %al = 0, the count of vector-register arguments for the variadic call.
    10e7:       e8 64 ff ff ff          call   1050              # Then format the argument already in %esi.

     The key thing to note here is that each comparison is just a
     single cmp/je pair, which is about as fast as you can reasonably
     hope for; in a larger switch, GCC would organize these into a
     balanced binary tree of ordered comparisons.  Also, because we’re
     using little-endian, thanks to GCC’s idiom recognition, the
     conversion from bytes to a 32-bit integer is just a single
     unaligned fetch.  So the whole chunk of 9 lines of C ends up as 20
     machine instructions.

     Interestingly, on RISC-V this is kind of a mess, taking 32
     instructions; the conversion from bytes to little-endian is 11
     instructions, about what you’d expect from a naïve interpretation
     of the C; ten of them are contiguous:

      54:       00144703        lbu     a4,1(s0)    # load byte unsigned p[1]
      58:       00044783        lbu     a5,0(s0)    # p[0]
      5c:       00344583        lbu     a1,3(s0)    # p[3]
      60:       0722            sll     a4,a4,0x8   # p[1] << 8
      62:       8f5d            or      a4,a4,a5    # | p[0]
      64:       00244783        lbu     a5,2(s0)    # p[2]
      68:       05e2            sll     a1,a1,0x18  # p[3] << 24
      6a:       07c2            sll     a5,a5,0x10  # p[2] << 16
      6c:       8fd9            or      a5,a5,a4
      6e:       8ddd            or      a1,a1,a5

     Then, each comparison to a constant requires two instructions to
     load the constant and a third instruction for the branch:

      70:       544357b7        lui     a5,0x54435       # Upper 20 bits of PICT;
      74:       2581            sext.w  a1,a1            # interleaved leftover bit of little-endian conversion;
      76:       95078793        addi    a5,a5,-1712      # f950, low 12 bits of PICT;
      7a:       02f58563        beq     a1,a5,a4 <.L3>   # invoke PICT handler.

     Compiling to non-Thumb ARM code (`arm-linux-gnueabi-gcc
     -Wno-multichar -mcpu=cortex-a72 -Os`, also 12.2.0) was a little
     better, 24 instructions if I exclude the return-0 code, plus a
     constant pool which RISC-V doesn’t need.  (To be fair, neither
     does recent ARM due to movt, but that would add more
     instructions.)

      2c:       e3550002        cmp     r5, #2            @ If argc != 2,
      30:       1a00000e        bne     70                @ go to hell.
      34:       e5944004        ldr     r4, [r4, #4]      @ Load argv[1] into a locals register,
      38:       e1a00004        mov     r0, r4            @ then pass it in
      3c:       ebfffffe        bl      0                 @ a call to strlen.
      40:       e3500004        cmp     r0, #4            @ If return value != 4,
      44:       1a000009        bne     70                @ go to hell.
      48:       e5941000        ldr     r1, [r4]          @ Unaligned load; without -mcpu this was 7 ldrb and orr instructions.
      4c:       e59f3050        ldr     r3, [pc, #80]     @ a4: 'PICT'
      50:       e1510003        cmp     r1, r3
      54:       0a000007        beq     78
      58:       e59f3048        ldr     r3, [pc, #72]     @ a8: 'za3k'
      5c:       e1510003        cmp     r1, r3
      60:       1a000007        bne     84
      64:       e59f0040        ldr     r0, [pc, #64]     @ ac: PC-relative za3k-string pointer
      68:       e08f0000        add     r0, pc, r0
      6c:       ebfffffe        bl      0
      70:       e3a00000        mov     r0, #0            @ set return value to 0
      74:       e8bd8070        pop     {r4, r5, r6, pc}
      78:       e59f0030        ldr     r0, [pc, #48]     @ b0
      7c:       e08f0000        add     r0, pc, r0
      80:       eafffff9        b       6c
      84:       e59f0028        ldr     r0, [pc, #40]     @ b4
      88:       e08f0000        add     r0, pc, r0
      8c:       ebfffffe        bl      0
      90:       eafffff6        b       70
      94:       00000078        .word   0x00000078
      98:       62636465        .word   0x62636465
      9c:       00000070        .word   0x00000070
      a0:       64636261        .word   0x64636261
      a4:       54434950        .word   0x54434950        @ 'PICT'.  Constants before this point are from other parts of the code.
      a8:       6b33617a        .word   0x6b33617a        @ 'za3k'
      ac:       0000003c        .word   0x0000003c        @ PC-relative string pointers
      b0:       0000002c        .word   0x0000002c
      b4:       00000024        .word   0x00000024

     So, that conditionally defined macro does a reasonable job of
     solving implementation-definedness, which is the worst problem
     with multicharacter literals.

     Still, it would be nice to support literals of more than four
     bytes (or two bytes on Arduino), to not be dependent on
     implementation-defined behavior, and to avoid the multi-character
     literal warning.  You probably don’t really want to disable that
     warning globally, because it’s vastly less common to actually want
     multi-character literals than it is to forget you aren’t writing
     Python or JS and try to quote a string with apostrophes.

     You could brutalize the preprocessor like this:
  */

  /* mclit(NAME) stringifies NAME and packs its characters into an
     integer, first character in the least significant byte.
     strlenswitch picks the packing macro from sizeof the string
     (length plus the terminating NUL).  The litN macros work in long
     long rather than long so that the << 32 in lit5 is well defined
     even where long is only 32 bits wide. */
#define mclit(s) strlenswitch(#s, emptylit, lit1, lit2, lit3, lit4, lit5)
#define strlenswitch(s, f0, f1, f2, f3, f4, f5) (       \
    sizeof(s) == 1 ? f0(s) :                            \
    sizeof(s) == 2 ? f1(s) :                            \
    sizeof(s) == 3 ? f2(s) :                            \
    sizeof(s) == 4 ? f3(s) :                            \
    sizeof(s) == 5 ? f4(s) :                            \
    sizeof(s) == 6 ? f5(s) :                            \
    multicharacter_literal_too_long(s))
#define emptylit(s) 0LL
#define lit1(s) ((long long)(s)[0])
#define lit2(s) (lit1(s) | ((long long)(s)[1] << 8))
#define lit3(s) (lit2(s) | ((long long)(s)[2] << 16))
#define lit4(s) (lit3(s) | ((long long)(s)[3] << 24))
#define lit5(s) (lit4(s) | ((long long)(s)[4] << 32))

  printf("A is %llx; AB is %llx; ABC is %llx; ABCD is %llx; ABCDE is %llx.\n",
         mclit(A), mclit(AB), mclit(ABC), mclit(ABCD), mclit(ABCDE));

  /* The above code does work, it will compute the same integer from a
     given string on every compiler, it doesn’t require disabling the
     multi-character character literal warning, and it handles more
     than 32 bits in the literal (though I only implemented that up to
     40).

     But it doesn’t permit compile-time error detection, and even if
     you replace the run-time error reporting above with a constant,
     you can’t use it as a switch case label:

        multicharlit.c:205:5: error: case label does not reduce to an integer constant
          205 |     case mclit(PASV): printf("PASV command\n"); break;
              |     ^~~~

     That’s from the below code:

        if (argc == 2 && strlen(argv[1]) == 4) {
          unsigned char *p = argv[1];
          int c = p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24;
          switch(c) {
          case mclit(PASV): printf("PASV command\n"); break;
          default: printf("other command\n"); break;
          }
        }

     On further investigation, even this reduced version produces the
     same error:

        case "X"[0]: printf("It’s X!\n"); break;

     I guess character strings aren’t constant enough to allow indexing
     into them at compile time in this kind of context.  The same
     problem occurs with this code:

        enum { PASV = mclit(PASV) };

     However, GCC *was* able to fully evaluate the mclit() calls in the
     printf outside the comment above at compile time:

        110c:       ba 41 42 00 00          mov    $0x4241,%edx
        1111:       41 b8 41 42 43 44       mov    $0x44434241,%r8d
        1117:       31 c0                   xor    %eax,%eax
        1119:       49 b9 41 42 43 44 45    movabs $0x4544434241,%r9
        1120:       00 00 00
        1123:       b9 41 42 43 00          mov    $0x434241,%ecx
        1128:       be 41 00 00 00          mov    $0x41,%esi
        112d:       48 8d 3d 40 0f 00 00    lea    0xf40(%rip),%rdi        # 2074 <_IO_stdin_used+0x74>
        1134:       e8 27 ff ff ff          call   1060

     We can leverage this to get a *linker* error if these new
     multi-character literals are too long, simply by not providing a
     definition of `multicharacter_literal_too_long`.  Even with -O0,
     GCC is able to constant-propagate and remove the dead code, so the
     following link-time error doesn't occur if all your multicharacter
     literals are small enough:

        /usr/bin/ld: /tmp/ccQbtmEA.o: in function `main':
        /home/user/dev3/multicharlit.c:192: undefined reference to `multicharacter_literal_too_long'
        collect2: error: ld returned 1 exit status

     But the line number is wrong by one, the error message is
     misleading, and if you omit -g, you don't get a line number at
     all, just a byte offset into the object file.
  */

  return 0;
}