Not having to deal with loading data from a bunch of small files makes it easier to work with your executable. You can forget about the working directory, just run your app or tests or benchmarks! Alternatively, if you need to ship a library which needs to access stored data, this is also one way of distributing the data with the library.
This post contains notes about embedding arbitrary data, such as images or compiled shaders, into a C or C++ program at build time. Doing this isn’t hard, but it can be tedious setting everything up with your build system of choice. Hopefully this saves someone time when integrating an embedding scheme in their build system of choice!
I am using CMake as a build system, and so embedding data is going to be driven by CMake. Ideally, no other tools than your build system should be required for this. It’s always painful when additional tools need to be installed and set up in order to build something.
As an aside, embedding data will not require any build tools in the future. Once our compilers implement C23, the easiest way of embedding data will be to simply call #embed
and call it a day.
1
2
3
|
// C23 only: the preprocessor replaces the #embed directive with the bytes of
// image.png, so the array is initialized with the file content.
const uint8_t image_data[] = {
#embed "image.png"
};
|
But at the time of writing, #embed
is not yet available and we must do it ourselves.
C program for embedding a file
First, we need a small C program, called embed-file
. Given a single file to be embedded, and a C header and source file, it will embed the input file as a single byte array.
Calling the program like this
$ embed-file assets/image.png embedded_data.h embedded_data.c
will output the following declarations into the header file,
1
2
|
// Declarations emitted for assets/image.png: the embedded bytes and their count.
extern char const IMAGE_PNG_BYTES[];
extern size_t const IMAGE_PNG_LENGTH;
|
and the following definitions into the source file.
1
2
3
4
5
6
|
// Definition emitted for assets/image.png: one hex literal per input byte,
// sixteen per row.
char const IMAGE_PNG_BYTES[] = {
0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52,
0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x08, 0x06, 0x00, 0x00, 0x00, 0xc3, 0x3e, 0x61,
// ...
};
// The length is derived from the array itself, so it always matches the data.
size_t const IMAGE_PNG_LENGTH = sizeof(IMAGE_PNG_BYTES);
|
The C source is straightforward, and has the following assumptions or limitations:
- Only one file name extension is supported. Files like
data.foo.bar
will yield broken variable names.
- File names must start with letters.
- Files without extensions will yield functioning code, but the variable name will look strange. The name
binary_file
will result in BINARY_FILE__BYTES[]
being generated.
These limitations could be solved, but since I currently only have files with filename.ext
this is good enough.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
|
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Size of the buffers holding the upper-cased file name and extension. */
#define BUFFER_SIZE 256

/*
 * embed-file: append one input file, encoded as a C byte array, to a
 * header/source pair.
 *
 * Usage: embed-file <input_binary_file> <output_h_file> <output_c_file>
 *
 * For an input named `name.ext` this appends declarations of NAME_EXT_BYTES
 * and NAME_EXT_LENGTH to the header and the matching definitions to the
 * source file. Both outputs are opened in append mode, so repeated calls
 * accumulate entries; any prologue (includes, pragmas) is left to the caller.
 *
 * Only the text after the last '/' is used for the symbol name, and only the
 * last '.' is treated as the extension separator.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on any usage, I/O or
 * allocation error.
 */
int main(int argc, char** argv)
{
    if (argc != 4)
    {
        fprintf(stderr, "Usage: %s <input_binary_file> <output_h_file> <output_c_file>\n", argv[0]);
        return EXIT_FAILURE;
    }

    char const* const input_binary_path = argv[1];
    char const* const output_h_path = argv[2];
    char const* const output_c_path = argv[3];

    /* Checked at run time (not with assert) so the guard survives NDEBUG builds. */
    if (strlen(input_binary_path) >= BUFFER_SIZE - 1)
    {
        fprintf(stderr, "Input path is too long (limit is %d characters): %s\n", BUFFER_SIZE - 2, input_binary_path);
        return EXIT_FAILURE;
    }

    //
    // Open input binary file and determine its size
    //
    FILE* const input_file = fopen(input_binary_path, "rb");
    if (!input_file)
    {
        fprintf(stderr, "Failed to open input file: %s\n", input_binary_path);
        return EXIT_FAILURE;
    }

    long file_size = -1;
    if (fseek(input_file, 0, SEEK_END) == 0)
    {
        file_size = ftell(input_file);
    }
    if (file_size < 0 || fseek(input_file, 0, SEEK_SET) != 0)
    {
        fprintf(stderr, "Failed to determine size of input file: %s\n", input_binary_path);
        fclose(input_file);
        return EXIT_FAILURE;
    }
    size_t const byte_count = (size_t)file_size;

    //
    // Read into buffer
    //
    // malloc(0) is allowed to return NULL, so always request at least one byte;
    // otherwise an empty input could be reported as an allocation failure.
    unsigned char* const buffer = malloc(byte_count > 0 ? byte_count : 1);
    if (!buffer)
    {
        fprintf(stderr, "Failed to allocate memory for file buffer\n");
        fclose(input_file);
        return EXIT_FAILURE;
    }

    size_t const bytes_read = fread(buffer, 1, byte_count, input_file);
    fclose(input_file);
    if (bytes_read != byte_count)
    {
        fprintf(stderr, "Failed to read input file: %s\n", input_binary_path);
        free(buffer);
        return EXIT_FAILURE;
    }

    //
    // Extract file name and extension, in upper-case
    //
    char file_name_upper[BUFFER_SIZE];
    char file_ext_upper[BUFFER_SIZE];
    {
        // NOTE: `strrchr` returns last occurrence
        char const* const last_slash = strrchr(input_binary_path, '/');
        char const* const file_name = last_slash ? last_slash + 1 : input_binary_path;
        char const* const last_dot = strrchr(file_name, '.');

        size_t const name_length = last_dot ? (size_t)(last_dot - file_name) : strlen(file_name);
        char const* const file_ext = last_dot ? last_dot + 1 : "";
        size_t const ext_length = strlen(file_ext);

        // Both lengths are bounded by the path length checked above, so the
        // copies below cannot overflow the BUFFER_SIZE arrays.
        // toupper() requires an unsigned char value; passing a negative plain
        // char is undefined behavior.
        for (size_t i = 0; i < name_length; ++i)
        {
            file_name_upper[i] = (char)toupper((unsigned char)file_name[i]);
        }
        file_name_upper[name_length] = '\0';

        for (size_t i = 0; i < ext_length; ++i)
        {
            file_ext_upper[i] = (char)toupper((unsigned char)file_ext[i]);
        }
        file_ext_upper[ext_length] = '\0';
    }

    //
    // Write to output files
    //
    FILE* const h_file = fopen(output_h_path, "a");
    if (!h_file)
    {
        fprintf(stderr, "Failed to open output file: %s\n", output_h_path);
        free(buffer);
        return EXIT_FAILURE;
    }

    FILE* const c_file = fopen(output_c_path, "a");
    if (!c_file)
    {
        fprintf(stderr, "Failed to open output file: %s\n", output_c_path);
        fclose(h_file);
        free(buffer);
        return EXIT_FAILURE;
    }

    fprintf(h_file, "\nextern char const %s_%s_BYTES[];\n", file_name_upper, file_ext_upper);
    fprintf(h_file, "extern size_t const %s_%s_LENGTH;\n", file_name_upper, file_ext_upper);

    // NOTE: an empty input file produces an empty initializer list, which
    // strict ISO C compilers may reject.
    fprintf(c_file, "\nchar const %s_%s_BYTES[] = {\n ", file_name_upper, file_ext_upper);
    for (long i = 0; i < file_size; ++i)
    {
        fprintf(c_file, "0x%02x, ", buffer[i]);
        // Break the row after every 16th byte
        if ((i + 1) % 16 == 0)
        {
            fprintf(c_file, "\n ");
        }
    }
    fprintf(c_file, "\n};\n");
    fprintf(
        c_file, "\nsize_t const %s_%s_LENGTH = sizeof(%s_%s_BYTES);\n", file_name_upper,
        file_ext_upper, file_name_upper, file_ext_upper);

    // Buffered write errors may only surface on fclose(), so check both.
    int write_ok = !ferror(h_file) && !ferror(c_file);
    if (fclose(c_file) != 0)
    {
        write_ok = 0;
    }
    if (fclose(h_file) != 0)
    {
        write_ok = 0;
    }
    free(buffer);

    if (!write_ok)
    {
        fprintf(stderr, "Failed to write output files: %s, %s\n", output_h_path, output_c_path);
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
|
CMake wrapper script for embedding files
A CMake script is required for iterating through a list of files and calling embed-file
on them. This script is called embed_data.cmake
and I usually place these sorts of scripts in a cmake
subdirectory in all my projects. A number of oddities are apparent in this script.
EMBED_FILE_EXE
the location of the executable is passed as a parameter. This script is executed as an external script, and the usual CMake variables like CMAKE_BINARY_DIR
are blank.
INPUT_FILES_LIST_STR
Lists in CMake are strings separated by the ';'
character.
Note that the CMake script initializes the header and source file with the necessary pragmas, includes, and comments, and embed-file
appends the variables containing the file content.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
# embed_data.cmake
#
# Script-mode driver (run with `cmake -P`): (re)creates the generated header and
# source file, then calls `embed-file` once per input so that it appends the
# byte arrays.
#
# Variables expected on the command line (-D):
#   EMBED_FILE_EXE  path to the embed-file executable
#   INPUT_FILES     files to embed, separated by spaces or ';'
#   OUTPUT_H_FILE   header file to generate
#   OUTPUT_C_FILE   source file to generate

# embed_data(<EMBED_FILE_EXE> <INPUT_FILES_LIST_STR> <H_FILE> <C_FILE>)
#
# Overwrites H_FILE and C_FILE with their prologue, then runs EMBED_FILE_EXE for
# every entry of INPUT_FILES_LIST_STR. Stops with a FATAL_ERROR as soon as one
# invocation does not exit with 0.
function(embed_data EMBED_FILE_EXE INPUT_FILES_LIST_STR H_FILE C_FILE)
  file(WRITE "${H_FILE}" "\
// This file is generated by CMake. Do not manually edit this file!\n\
\n\
#pragma once\n\
\n\
#include <stddef.h>\n")

  # The source file includes the header by bare file name.
  get_filename_component(H_FILENAME_COMPONENT "${H_FILE}" NAME)
  file(WRITE "${C_FILE}" "\
// This file is generated by CMake. Do not manually edit this file!\n\
\n\
#include \"${H_FILENAME_COMPONENT}\"\n")

  # Turn a space-separated string into a CMake list. The expansion is quoted so
  # that an input which already contains ';' is kept as a list instead of being
  # concatenated by string(REPLACE).
  string(REPLACE " " ";" BINARY_FILES "${INPUT_FILES_LIST_STR}")
  # Consecutive separators would otherwise produce empty entries.
  list(REMOVE_ITEM BINARY_FILES "")

  foreach(BINARY_FILE IN LISTS BINARY_FILES)
    # `embed-file` called here
    execute_process(
      COMMAND "${EMBED_FILE_EXE}" "${BINARY_FILE}" "${H_FILE}" "${C_FILE}"
      RESULT_VARIABLE RESULT
      OUTPUT_VARIABLE OUTPUT
      ERROR_VARIABLE ERROR
      COMMAND_ECHO STDOUT)
    # RESULT is an error string rather than a number when the process could not
    # be started, so compare it as a string.
    if(NOT "${RESULT}" STREQUAL "0")
      message(FATAL_ERROR "Failed to embed ${BINARY_FILE}: ${ERROR}")
    endif()
  endforeach()
endfunction()

embed_data("${EMBED_FILE_EXE}" "${INPUT_FILES}" "${OUTPUT_H_FILE}" "${OUTPUT_C_FILE}")
|
Putting everything together in CMakeLists.txt
Finally, we need to generate a target which calls the script for the files we want to embed. First, embed-file
needs to be built.
1
2
|
# embed-file: build-time helper executable that appends one input file, as a C
# byte array, to a header/source pair.
add_executable(embed-file src/embed_file/main.c)
|
We now have everything we need to define a custom command which will embed the files we list into source code at build time.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
# Files whose content is embedded into the generated source code.
set(EMBED_DATA_SOURCE_FILES
  ${CMAKE_SOURCE_DIR}/assets/image1.png
  ${CMAKE_SOURCE_DIR}/assets/image2.png)

# Generated header/source pair.
set(EMBEDDED_DATA_H ${CMAKE_SOURCE_DIR}/src/embedded_data.h)
set(EMBEDDED_DATA_C ${CMAKE_SOURCE_DIR}/src/embedded_data.c)

# Build rule that runs cmake/embed_data.cmake in script mode to regenerate both
# output files.
# NOTE(review): INPUT_FILES appears to reach the script as a space-separated
# string (the script splits it on spaces); keep this quoting and the script in
# sync when changing either.
add_custom_command(
  OUTPUT ${EMBEDDED_DATA_H} ${EMBEDDED_DATA_C}
  COMMAND ${CMAKE_COMMAND} -DEMBED_FILE_EXE="$<TARGET_FILE:embed-file>"
          -DINPUT_FILES="${EMBED_DATA_SOURCE_FILES}"
          -DOUTPUT_H_FILE=${EMBEDDED_DATA_H}
          -DOUTPUT_C_FILE=${EMBEDDED_DATA_C}
          -P ${CMAKE_SOURCE_DIR}/cmake/embed_data.cmake
  COMMENT "Embedding data"
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
  DEPENDS ${EMBED_DATA_SOURCE_FILES} # If input unchanged, the command is not executed
          # Also regenerate when the embedding tool or the driver script changes.
          # Naming the executable target here additionally makes sure it is
          # built before this command runs.
          embed-file
          ${CMAKE_SOURCE_DIR}/cmake/embed_data.cmake)
|
The location of the embed-file
executable is given by -DEMBED_FILE_EXE="$<TARGET_FILE:embed-file>"
. But we need to ensure that we actually build it before this custom command is executed. To do so, a custom target is created which will depend on the output source code of our command.
1
|
# Named target that other targets can depend on; building it brings the
# generated source file up to date.
add_custom_target(embed-data DEPENDS ${EMBEDDED_DATA_C})
|
We can now add embed-file
as a dependency of embed-data
.
1
|
# Make sure the embed-file executable is built before embed-data runs.
add_dependencies(embed-data embed-file)
|
embed-data
can now be used as a dependency in executable targets. Any changes to the input files or file list will result in the files being re-embedded at build time.
1
|
# Build embed-data (and therefore regenerate the embedded sources) before my-app.
# NOTE(review): this only orders the build; presumably the generated source file
# is also listed among my-app's sources — confirm.
add_dependencies(my-app embed-data)
|
If you need to retrigger the embedding command without the input files changing, you can remove the dependency DEPENDS ${EMBED_DATA_SOURCE_FILES}
from the command, or you can touch the files.
$ touch assets/image1.png
Masochism: implementing embed-file
in CMake
If, for some reason, you want a pure-CMake solution, it is possible to embed data with a CMake script. This approach is orders of magnitude slower, and will start to result in flow-breaking slow build times when the file sizes exceed 50 kiB. But writing this sort of functionality in CMake makes for an interesting, if painful, exercise. Note that the following code was written in the middle of the night with a heavy dose of AI assistance.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
# binary_file_to_c_array(<INPUT_BINARY_FILE> <OUTPUT_H_FILE> <OUTPUT_C_FILE>)
#
# Pure-CMake counterpart of `embed-file`: reads INPUT_BINARY_FILE and appends
#   - declarations of <NAME>_<EXT>_BYTES / <NAME>_<EXT>_LENGTH to OUTPUT_H_FILE
#   - the matching definitions to OUTPUT_C_FILE
# where <NAME> is the upper-cased file name up to its first '.' and <EXT> the
# upper-cased last extension (empty when the file has none).
function(binary_file_to_c_array INPUT_BINARY_FILE OUTPUT_H_FILE OUTPUT_C_FILE)
  # Read the file as one long string of hexadecimal digits.
  file(READ "${INPUT_BINARY_FILE}" HEX_CONTENT HEX)

  # Split the hex string into a list of two-digit items, one per byte.
  # Quoted so that an empty file yields an empty list instead of an error.
  string(REGEX MATCHALL "([A-Fa-f0-9][A-Fa-f0-9])" SPLIT_HEX "${HEX_CONTENT}")

  # https://cmake.org/cmake/help/latest/command/get_filename_component.html
  get_filename_component(FILE_NAME_WE "${INPUT_BINARY_FILE}" NAME_WE)
  get_filename_component(FILE_EXT "${INPUT_BINARY_FILE}" LAST_EXT)
  # LAST_EXT includes the leading dot; strip it. A regex is used instead of
  # string(SUBSTRING) because the latter fails on an empty extension.
  string(REGEX REPLACE "^\\." "" FILE_EXT "${FILE_EXT}")

  string(TOUPPER "${FILE_NAME_WE}" FILE_NAME_UPPER)
  string(TOUPPER "${FILE_EXT}" FILE_EXT_UPPER)
  set(SYMBOL_PREFIX "${FILE_NAME_UPPER}_${FILE_EXT_UPPER}")

  # The outputs are built as plain strings and written through quoted
  # arguments, so ';' needs no escaping.
  set(H_OUTPUT "\nextern char const ${SYMBOL_PREFIX}_BYTES[];\n")
  string(APPEND H_OUTPUT "extern size_t const ${SYMBOL_PREFIX}_LENGTH;\n")

  set(C_OUTPUT "\nchar const ${SYMBOL_PREFIX}_BYTES[] = {\n ")

  set(COUNTER 1)       # 1-based position within the current row
  set(SPLIT_HEX_IDX 0) # number of bytes emitted so far
  list(LENGTH SPLIT_HEX SPLIT_HEX_LENGTH)
  foreach(HEX IN LISTS SPLIT_HEX)
    string(APPEND C_OUTPUT "0x${HEX}, ")
    math(EXPR COUNTER "${COUNTER} + 1")
    math(EXPR SPLIT_HEX_IDX "${SPLIT_HEX_IDX} + 1")
    # Start a new row after 16 bytes, except after the very last byte.
    if(SPLIT_HEX_IDX LESS SPLIT_HEX_LENGTH AND COUNTER GREATER 16)
      string(APPEND C_OUTPUT "\n ")
      set(COUNTER 1)
    endif()
  endforeach()

  string(APPEND C_OUTPUT "\n};\n")
  string(APPEND C_OUTPUT "\nsize_t const ${SYMBOL_PREFIX}_LENGTH = sizeof(${SYMBOL_PREFIX}_BYTES);\n")

  file(APPEND "${OUTPUT_H_FILE}" "${H_OUTPUT}")
  file(APPEND "${OUTPUT_C_FILE}" "${C_OUTPUT}")
endfunction()
|