Skip to content

Commit 4188d63

Browse files
committed
in_tail: Add Unicode encoder support
Conversion to UTF-8 is supported from UTF-16LE and UTF-16BE input, each with or without a BOM. This is useful for Windows logs that are written in Unicode, which usually use UTF-16LE with a BOM. Signed-off-by: Hiroshi Hatake <[email protected]>
1 parent 5905ace commit 4188d63

File tree

4 files changed

+69
-0
lines changed

4 files changed

+69
-0
lines changed

plugins/in_tail/tail.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,14 @@ static struct flb_config_map config_map[] = {
804804
},
805805
#endif
806806

807+
#ifdef FLB_HAVE_UNICODE_ENCODER
808+
{
809+
FLB_CONFIG_MAP_STR, "unicode.encoding", NULL,
810+
0, FLB_FALSE, 0,
811+
"specify the preferred input encoding for converting to UTF-8. "
812+
"Currently, UTF-16LE, UTF-16BE, auto are supported.",
813+
},
814+
#endif
807815
/* EOF */
808816
{0}
809817
};

plugins/in_tail/tail_config.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
#include "tail_multiline.h"
3737
#endif
3838

39+
#ifdef FLB_HAVE_UNICODE_ENCODER
40+
#include <fluent-bit/simdutf/flb_simdutf_connector.h>
41+
#endif
42+
3943
static int multiline_load_parsers(struct flb_tail_config *ctx)
4044
{
4145
struct mk_list *head;
@@ -95,6 +99,9 @@ struct flb_tail_config *flb_tail_config_create(struct flb_input_instance *ins,
9599
#ifdef FLB_HAVE_SQLDB
96100
ctx->db_sync = 1; /* sqlite sync 'normal' */
97101
#endif
102+
#ifdef FLB_HAVE_UNICODE_ENCODER
103+
ctx->preferred_input_encoding = FLB_SIMDUTF_ENCODING_TYPE_UNSPECIFIED;
104+
#endif
98105

99106
/* Load the config map */
100107
ret = flb_input_config_map_set(ins, (void *) ctx);
@@ -178,6 +185,24 @@ struct flb_tail_config *flb_tail_config_create(struct flb_input_instance *ins,
178185
return NULL;
179186
}
180187

188+
#ifdef FLB_HAVE_UNICODE_ENCODER
189+
tmp = flb_input_get_property("unicode.encoding", ins);
190+
if (tmp) {
191+
if (strcasecmp(tmp, "auto") == 0) {
192+
ctx->preferred_input_encoding = FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO;
193+
}
194+
else if (strcasecmp(tmp, "utf-16le") == 0) {
195+
ctx->preferred_input_encoding = FLB_SIMDUTF_ENCODING_TYPE_UTF16_LE;
196+
}
197+
else if (strcasecmp(tmp, "utf-16be") == 0) {
198+
ctx->preferred_input_encoding = FLB_SIMDUTF_ENCODING_TYPE_UTF16_BE;
199+
}
200+
else {
201+
flb_plg_error(ctx->ins, "invalid encoding 'unicode.encoding' value");
202+
}
203+
}
204+
#endif
205+
181206
#ifdef FLB_HAVE_PARSER
182207
/* Config: multi-line support */
183208
if (ctx->multiline == FLB_TRUE) {

plugins/in_tail/tail_config.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ struct flb_tail_config {
122122
/* Parser / Format */
123123
struct flb_parser *parser;
124124

125+
#ifdef FLB_HAVE_UNICODE_ENCODER
126+
int preferred_input_encoding;
127+
#endif
128+
125129
/* Multiline */
126130
int multiline; /* multiline enabled ? */
127131
int multiline_flush; /* multiline flush/wait */

plugins/in_tail/tail_file.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@
4848
#include "win32.h"
4949
#endif
5050

51+
#ifdef FLB_HAVE_UNICODE_ENCODER
52+
#include <fluent-bit/simdutf/flb_simdutf_connector.h>
53+
#endif
54+
5155
#include <cfl/cfl.h>
5256

5357
static inline void consume_bytes(char *buf, int bytes, int length)
@@ -440,6 +444,10 @@ static int process_content(struct flb_tail_file *file, size_t *bytes)
440444
time_t now = time(NULL);
441445
struct flb_time out_time = {0};
442446
struct flb_tail_config *ctx;
447+
#ifdef FLB_HAVE_UNICODE_ENCODER
448+
char *decoded = NULL;
449+
size_t decoded_len;
450+
#endif
443451

444452
ctx = (struct flb_tail_config *) file->config;
445453

@@ -509,6 +517,24 @@ static int process_content(struct flb_tail_file *file, size_t *bytes)
509517
line_len = len - crlf;
510518
repl_line = NULL;
511519

520+
#ifdef FLB_HAVE_UNICODE_ENCODER
521+
if (ctx->preferred_input_encoding != FLB_SIMDUTF_ENCODING_TYPE_UNSPECIFIED) {
522+
decoded = NULL;
523+
ret = flb_simdutf_connector_convert_from_unicode(ctx->preferred_input_encoding,
524+
line, line_len, &decoded, &decoded_len);
525+
if (ret == FLB_SIMDUTF_CONNECTOR_CONVERT_OK) {
526+
line = decoded;
527+
line_len = decoded_len;
528+
} else if (ret == FLB_SIMDUTF_CONNECTOR_CONVERT_NOP) {
529+
flb_plg_debug(ctx->ins, "nothing to convert encoding '%.*s'", line_len, line);
530+
}
531+
else {
532+
flb_plg_error(ctx->ins, "encoding failed '%.*s'", line_len, line);
533+
goto go_next;
534+
}
535+
}
536+
#endif
537+
512538
if (ctx->ml_ctx) {
513539
ret = flb_ml_append_text(ctx->ml_ctx,
514540
file->ml_stream_id,
@@ -601,6 +627,12 @@ static int process_content(struct flb_tail_file *file, size_t *bytes)
601627
lines++;
602628
file->parsed = 0;
603629
file->last_processed_bytes += processed_bytes;
630+
#ifdef FLB_HAVE_UNICODE_ENCODER
631+
if (decoded) {
632+
flb_free(decoded);
633+
decoded = NULL;
634+
}
635+
#endif
604636
}
605637
file->parsed = file->buf_len;
606638

0 commit comments

Comments
 (0)