libpqxx
The C++ client library for PostgreSQL
encodings.hxx
1 
9 #ifndef PQXX_H_ENCODINGS
10 #define PQXX_H_ENCODINGS
11 
12 #include <iomanip>
13 #include <string>
14 #include <string_view>
15 
16 #include "pqxx/internal/concat.hxx"
17 #include "pqxx/internal/encoding_group.hxx"
18 
19 
20 namespace pqxx
21 {
22 PQXX_DECLARE_ENUM_CONVERSION(pqxx::internal::encoding_group);
23 } // namespace pqxx
24 
25 
26 namespace pqxx::internal
27 {
29 PQXX_PURE char const *name_encoding(int encoding_id);
30 
32 PQXX_LIBEXPORT encoding_group enc_group(int /* libpq encoding ID */);
33 
34 
36 
40 PQXX_LIBEXPORT glyph_scanner_func *get_glyph_scanner(encoding_group);
41 
42 
43 // TODO: Get rid of this one. Use compile-time-specialised version instead.
45 
51 template<char... NEEDLE>
52 inline std::size_t find_char(
53  glyph_scanner_func *scanner, std::string_view haystack,
54  std::size_t here = 0u)
55 {
56  auto const sz{std::size(haystack)};
57  auto const data{std::data(haystack)};
58  while (here < sz)
59  {
60  auto next{scanner(data, sz, here)};
61  PQXX_ASSUME(next > here);
62  // (For some reason gcc had a problem with a right-fold here. But clang
63  // was fine.)
64  if ((... or (data[here] == NEEDLE)))
65  {
66  // Also check against a multibyte character starting with a bytes which
67  // just happens to match one of the ASCII bytes we're looking for. It'd
68  // be cleaner to check that first, but either works. So, let's apply the
69  // most selective filter first and skip this check in almost all cases.
70  if (next == here + 1)
71  return here;
72  }
73 
74  // Nope, no hit. Move on.
75  here = next;
76  }
77  return sz;
78 }
79 
80 
81 // TODO: Get rid of this one. Use compile-time-specialised loop instead.
83 
86 template<typename CALLABLE>
87 inline void for_glyphs(
88  encoding_group enc, CALLABLE callback, char const buffer[],
89  std::size_t buffer_len, std::size_t start = 0)
90 {
91  auto const scan{get_glyph_scanner(enc)};
92  for (std::size_t here = start, next; here < buffer_len; here = next)
93  {
94  next = scan(buffer, buffer_len, here);
95  PQXX_ASSUME(next > here);
96  callback(buffer + here, buffer + next);
97  }
98 }
99 
100 
101 namespace
102 {
104 constexpr PQXX_PURE unsigned char
105 get_byte(char const buffer[], std::size_t offset) noexcept
106 {
107  return static_cast<unsigned char>(buffer[offset]);
108 }
109 
110 
111 [[noreturn]] PQXX_COLD void throw_for_encoding_error(
112  char const *encoding_name, char const buffer[], std::size_t start,
113  std::size_t count)
114 {
115  std::stringstream s;
116  s << "Invalid byte sequence for encoding " << encoding_name << " at byte "
117  << start << ": " << std::hex << std::setw(2) << std::setfill('0');
118  for (std::size_t i{0}; i < count; ++i)
119  {
120  s << "0x" << static_cast<unsigned int>(get_byte(buffer, start + i));
121  if (i + 1 < count)
122  s << " ";
123  }
124  throw pqxx::argument_error{s.str()};
125 }
126 
127 
129 constexpr PQXX_PURE bool
130 between_inc(unsigned char value, unsigned bottom, unsigned top)
131 {
132  return value >= bottom and value <= top;
133 }
134 } // namespace
135 
136 
138 
142 template<encoding_group> struct glyph_scanner
143 {
144  // TODO: Convert to use string_view?
146  PQXX_PURE static std::size_t
147  call(char const buffer[], std::size_t buffer_len, std::size_t start);
148 };
149 
150 
151 namespace
152 {
154 
160 template<encoding_group ENC, char... NEEDLE>
161 PQXX_PURE inline std::size_t
162 find_ascii_char(std::string_view haystack, std::size_t here)
163 {
164  // We only know how to search for ASCII characters. It's an optimisation
165  // assumption in the code below.
166  static_assert((... and ((NEEDLE & 0x80) == 0)));
167 
168  auto const sz{std::size(haystack)};
169  auto const data{std::data(haystack)};
170  while (here < sz)
171  {
172  // Look up the next character boundary. This can be quite costly, so we
173  // desperately want the call inlined.
174  auto next{glyph_scanner<ENC>::call(data, sz, here)};
175  PQXX_ASSUME(next > here);
176 
177  // (For some reason gcc had a problem with a right-fold here. But clang
178  // was fine.)
179  //
180  // In all supported encodings, if a character's first byte is in the ASCII
181  // range, that means it's a single-byte character. It follows that when we
182  // find a match, we do not need to check that we're in a single-byte
183  // character:
184  //
185  // If this is an "ASCII-unsafe" encoding, e.g. SJIS, we're only checking
186  // each character's first byte. That first byte can only match NEEDLE if
187  // it's a single-byte character.
188  //
189  // In an "ASCII-safe" encoding, e.g. UTF-8 or the ISO-8859 ones, we check
190  // for a match at each byte in the text, because it's faster than finding
191  // character boundaries first. But in these encodings, a multichar byte
192  // never contains any bytes in the ASCII range at all.
193  if ((... or (data[here] == NEEDLE)))
194  return here;
195 
196  // Nope, no hit. Move on.
197  here = next;
198  }
199  return sz;
200 }
201 } // namespace
202 
203 
205 
209 template<encoding_group ENC, char... NEEDLE>
210 PQXX_PURE std::size_t
211 find_s_ascii_char(std::string_view haystack, std::size_t here)
212 {
213  // We only know how to search for ASCII characters. It's an optimisation
214  // assumption in the code below.
215  static_assert((... and ((NEEDLE >> 7) == 0)));
216 
217  auto const sz{std::size(haystack)};
218  auto const data{std::data(haystack)};
219 
220  // No supported encoding has multibyte characters that start with an
221  // ASCII-range byte.
222  while ((... and (data[here] != NEEDLE)))
223  {
224  auto const next = glyph_scanner<ENC>::call(data, sz, here);
225  PQXX_ASSUME(next > here);
226  here = next;
227  }
228  return here;
229 }
230 
231 
232 template<> struct glyph_scanner<encoding_group::MONOBYTE>
233 {
234  static PQXX_PURE constexpr std::size_t
235  call(char const /* buffer */[], std::size_t buffer_len, std::size_t start)
236  {
237  // TODO: Don't bother with npos. Let the caller check.
238  if (start >= buffer_len)
239  PQXX_UNLIKELY return std::string::npos;
240  else
241  return start + 1;
242  }
243 };
244 
245 
246 // https://en.wikipedia.org/wiki/Big5#Organization
247 template<> struct glyph_scanner<encoding_group::BIG5>
248 {
249  static PQXX_PURE std::size_t
250  call(char const buffer[], std::size_t buffer_len, std::size_t start)
251  {
252  if (start >= buffer_len)
253  PQXX_UNLIKELY return std::string::npos;
254 
255  auto const byte1{get_byte(buffer, start)};
256  if (byte1 < 0x80)
257  return start + 1;
258 
259  if (not between_inc(byte1, 0x81, 0xfe) or (start + 2 > buffer_len))
260  PQXX_UNLIKELY
261  throw_for_encoding_error("BIG5", buffer, start, 1);
262 
263  auto const byte2{get_byte(buffer, start + 1)};
264  if (
265  not between_inc(byte2, 0x40, 0x7e) and
266  not between_inc(byte2, 0xa1, 0xfe))
267  PQXX_UNLIKELY
268  throw_for_encoding_error("BIG5", buffer, start, 2);
269 
270  return start + 2;
271  }
272 };
273 
274 
275 /*
276 The PostgreSQL documentation claims that the EUC_* encodings are 1-3 bytes
277 each, but other documents explain that the EUC sets can contain 1-(2,3,4) bytes
278 depending on the specific extension:
279  EUC_CN : 1-2
280  EUC_JP : 1-3
281  EUC_JIS_2004: 1-2
282  EUC_KR : 1-2
283  EUC_TW : 1-4
284 */
285 
286 // https://en.wikipedia.org/wiki/GB_2312#EUC-CN
287 template<> struct glyph_scanner<encoding_group::EUC_CN>
288 {
289  static PQXX_PURE std::size_t
290  call(char const buffer[], std::size_t buffer_len, std::size_t start)
291  {
292  if (start >= buffer_len)
293  return std::string::npos;
294 
295  auto const byte1{get_byte(buffer, start)};
296  if (byte1 < 0x80)
297  return start + 1;
298 
299  if (not between_inc(byte1, 0xa1, 0xf7) or start + 2 > buffer_len)
300  PQXX_UNLIKELY
301  throw_for_encoding_error("EUC_CN", buffer, start, 1);
302 
303  auto const byte2{get_byte(buffer, start + 1)};
304  if (not between_inc(byte2, 0xa1, 0xfe))
305  PQXX_UNLIKELY
306  throw_for_encoding_error("EUC_CN", buffer, start, 2);
307 
308  return start + 2;
309  }
310 };
311 
312 
313 // EUC-JP and EUC-JIS-2004 represent slightly different code points but iterate
314 // the same:
315 //
316 // https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-JP
317 // http://x0213.org/codetable/index.en.html
318 template<> struct glyph_scanner<encoding_group::EUC_JP>
319 {
320  static PQXX_PURE std::size_t
321  call(char const buffer[], std::size_t buffer_len, std::size_t start)
322  {
323  if (start >= buffer_len)
324  return std::string::npos;
325 
326  auto const byte1{get_byte(buffer, start)};
327  if (byte1 < 0x80)
328  return start + 1;
329 
330  if (start + 2 > buffer_len)
331  PQXX_UNLIKELY
332  throw_for_encoding_error("EUC_JP", buffer, start, 1);
333 
334  auto const byte2{get_byte(buffer, start + 1)};
335  if (byte1 == 0x8e)
336  {
337  if (not between_inc(byte2, 0xa1, 0xfe))
338  PQXX_UNLIKELY
339  throw_for_encoding_error("EUC_JP", buffer, start, 2);
340 
341  return start + 2;
342  }
343 
344  if (between_inc(byte1, 0xa1, 0xfe))
345  {
346  if (not between_inc(byte2, 0xa1, 0xfe))
347  PQXX_UNLIKELY
348  throw_for_encoding_error("EUC_JP", buffer, start, 2);
349 
350  return start + 2;
351  }
352 
353  if (byte1 == 0x8f and start + 3 <= buffer_len)
354  {
355  auto const byte3{get_byte(buffer, start + 2)};
356  if (
357  not between_inc(byte2, 0xa1, 0xfe) or
358  not between_inc(byte3, 0xa1, 0xfe))
359  PQXX_UNLIKELY
360  throw_for_encoding_error("EUC_JP", buffer, start, 3);
361 
362  return start + 3;
363  }
364 
365  throw_for_encoding_error("EUC_JP", buffer, start, 1);
366  }
367 };
368 
369 
370 // https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-KR
371 template<> struct glyph_scanner<encoding_group::EUC_KR>
372 {
373  static PQXX_PURE std::size_t
374  call(char const buffer[], std::size_t buffer_len, std::size_t start)
375  {
376  if (start >= buffer_len)
377  PQXX_UNLIKELY return std::string::npos;
378 
379  auto const byte1{get_byte(buffer, start)};
380  if (byte1 < 0x80)
381  return start + 1;
382 
383  if (not between_inc(byte1, 0xa1, 0xfe) or start + 2 > buffer_len)
384  PQXX_UNLIKELY
385  throw_for_encoding_error("EUC_KR", buffer, start, 1);
386 
387  auto const byte2{get_byte(buffer, start + 1)};
388  if (not between_inc(byte2, 0xa1, 0xfe))
389  PQXX_UNLIKELY
390  throw_for_encoding_error("EUC_KR", buffer, start, 1);
391 
392  return start + 2;
393  }
394 };
395 
396 
397 // https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-TW
398 template<> struct glyph_scanner<encoding_group::EUC_TW>
399 {
400  static PQXX_PURE std::size_t
401  call(char const buffer[], std::size_t buffer_len, std::size_t start)
402  {
403  if (start >= buffer_len)
404  PQXX_UNLIKELY
405  return std::string::npos;
406 
407  auto const byte1{get_byte(buffer, start)};
408  if (byte1 < 0x80)
409  return start + 1;
410 
411  if (start + 2 > buffer_len)
412  PQXX_UNLIKELY
413  throw_for_encoding_error("EUC_KR", buffer, start, 1);
414 
415  auto const byte2{get_byte(buffer, start + 1)};
416  if (between_inc(byte1, 0xa1, 0xfe))
417  {
418  if (not between_inc(byte2, 0xa1, 0xfe))
419  PQXX_UNLIKELY
420  throw_for_encoding_error("EUC_KR", buffer, start, 2);
421 
422  return start + 2;
423  }
424 
425  if (byte1 != 0x8e or start + 4 > buffer_len)
426  PQXX_UNLIKELY
427  throw_for_encoding_error("EUC_KR", buffer, start, 1);
428 
429  if (
430  between_inc(byte2, 0xa1, 0xb0) and
431  between_inc(get_byte(buffer, start + 2), 0xa1, 0xfe) and
432  between_inc(get_byte(buffer, start + 3), 0xa1, 0xfe))
433  return start + 4;
434 
435  PQXX_UNLIKELY
436  throw_for_encoding_error("EUC_KR", buffer, start, 4);
437  }
438 };
439 
440 
441 // https://en.wikipedia.org/wiki/GB_18030#Mapping
442 template<> struct glyph_scanner<encoding_group::GB18030>
443 {
444  static PQXX_PURE std::size_t
445  call(char const buffer[], std::size_t buffer_len, std::size_t start)
446  {
447  if (start >= buffer_len)
448  PQXX_UNLIKELY return std::string::npos;
449 
450  auto const byte1{get_byte(buffer, start)};
451  if (byte1 < 0x80)
452  return start + 1;
453  if (byte1 == 0x80)
454  throw_for_encoding_error("GB18030", buffer, start, buffer_len - start);
455 
456  if (start + 2 > buffer_len)
457  PQXX_UNLIKELY
458  throw_for_encoding_error("GB18030", buffer, start, buffer_len - start);
459 
460  auto const byte2{get_byte(buffer, start + 1)};
461  if (between_inc(byte2, 0x40, 0xfe))
462  {
463  if (byte2 == 0x7f)
464  PQXX_UNLIKELY
465  throw_for_encoding_error("GB18030", buffer, start, 2);
466 
467  return start + 2;
468  }
469 
470  if (start + 4 > buffer_len)
471  PQXX_UNLIKELY
472  throw_for_encoding_error("GB18030", buffer, start, buffer_len - start);
473 
474  if (
475  between_inc(byte2, 0x30, 0x39) and
476  between_inc(get_byte(buffer, start + 2), 0x81, 0xfe) and
477  between_inc(get_byte(buffer, start + 3), 0x30, 0x39))
478  return start + 4;
479 
480  PQXX_UNLIKELY
481  throw_for_encoding_error("GB18030", buffer, start, 4);
482  }
483 };
484 
485 
486 // https://en.wikipedia.org/wiki/GBK_(character_encoding)#Encoding
487 template<> struct glyph_scanner<encoding_group::GBK>
488 {
489  static PQXX_PURE std::size_t
490  call(char const buffer[], std::size_t buffer_len, std::size_t start)
491  {
492  if (start >= buffer_len)
493  PQXX_UNLIKELY return std::string::npos;
494 
495  auto const byte1{get_byte(buffer, start)};
496  if (byte1 < 0x80)
497  return start + 1;
498 
499  if (start + 2 > buffer_len)
500  PQXX_UNLIKELY
501  throw_for_encoding_error("GBK", buffer, start, 1);
502 
503  auto const byte2{get_byte(buffer, start + 1)};
504  if (
505  (between_inc(byte1, 0xa1, 0xa9) and between_inc(byte2, 0xa1, 0xfe)) or
506  (between_inc(byte1, 0xb0, 0xf7) and between_inc(byte2, 0xa1, 0xfe)) or
507  (between_inc(byte1, 0x81, 0xa0) and between_inc(byte2, 0x40, 0xfe) and
508  byte2 != 0x7f) or
509  (between_inc(byte1, 0xaa, 0xfe) and between_inc(byte2, 0x40, 0xa0) and
510  byte2 != 0x7f) or
511  (between_inc(byte1, 0xa8, 0xa9) and between_inc(byte2, 0x40, 0xa0) and
512  byte2 != 0x7f) or
513  (between_inc(byte1, 0xaa, 0xaf) and between_inc(byte2, 0xa1, 0xfe)) or
514  (between_inc(byte1, 0xf8, 0xfe) and between_inc(byte2, 0xa1, 0xfe)) or
515  (between_inc(byte1, 0xa1, 0xa7) and between_inc(byte2, 0x40, 0xa0) and
516  byte2 != 0x7f))
517  return start + 2;
518 
519  PQXX_UNLIKELY
520  throw_for_encoding_error("GBK", buffer, start, 2);
521  }
522 };
523 
524 
525 /*
526 The PostgreSQL documentation claims that the JOHAB encoding is 1-3 bytes, but
527 "CJKV Information Processing" describes it (actually just the Hangul portion)
528 as "three five-bit segments" that reside inside 16 bits (2 bytes).
529 
530 CJKV Information Processing by Ken Lunde, pg. 269:
531 
532  https://bit.ly/2BEOu5V
533 */
534 template<> struct glyph_scanner<encoding_group::JOHAB>
535 {
536  static PQXX_PURE std::size_t
537  call(char const buffer[], std::size_t buffer_len, std::size_t start)
538  {
539  if (start >= buffer_len)
540  PQXX_UNLIKELY return std::string::npos;
541 
542  auto const byte1{get_byte(buffer, start)};
543  if (byte1 < 0x80)
544  return start + 1;
545 
546  if (start + 2 > buffer_len)
547  PQXX_UNLIKELY
548  throw_for_encoding_error("JOHAB", buffer, start, 1);
549 
550  auto const byte2{get_byte(buffer, start)};
551  if (
552  (between_inc(byte1, 0x84, 0xd3) and
553  (between_inc(byte2, 0x41, 0x7e) or between_inc(byte2, 0x81, 0xfe))) or
554  ((between_inc(byte1, 0xd8, 0xde) or between_inc(byte1, 0xe0, 0xf9)) and
555  (between_inc(byte2, 0x31, 0x7e) or between_inc(byte2, 0x91, 0xfe))))
556  return start + 2;
557 
558  PQXX_UNLIKELY
559  throw_for_encoding_error("JOHAB", buffer, start, 2);
560  }
561 };
562 
563 
564 /*
565 PostgreSQL's MULE_INTERNAL is the emacs rather than Xemacs implementation;
566 see the server/mb/pg_wchar.h PostgreSQL header file.
567 This is implemented according to the description in said header file, but I was
568 unable to get it to successfully iterate a MULE-encoded test CSV generated
569 using PostgreSQL 9.2.23. Use this at your own risk.
570 */
571 template<> struct glyph_scanner<encoding_group::MULE_INTERNAL>
572 {
573  static PQXX_PURE std::size_t
574  call(char const buffer[], std::size_t buffer_len, std::size_t start)
575  {
576  if (start >= buffer_len)
577  PQXX_UNLIKELY return std::string::npos;
578 
579  auto const byte1{get_byte(buffer, start)};
580  if (byte1 < 0x80)
581  return start + 1;
582 
583  if (start + 2 > buffer_len)
584  PQXX_UNLIKELY
585  throw_for_encoding_error("MULE_INTERNAL", buffer, start, 1);
586 
587  auto const byte2{get_byte(buffer, start + 1)};
588  if (between_inc(byte1, 0x81, 0x8d) and byte2 >= 0xa0)
589  return start + 2;
590 
591  if (start + 3 > buffer_len)
592  PQXX_UNLIKELY
593  throw_for_encoding_error("MULE_INTERNAL", buffer, start, 2);
594 
595  if (
596  ((byte1 == 0x9a and between_inc(byte2, 0xa0, 0xdf)) or
597  (byte1 == 0x9b and between_inc(byte2, 0xe0, 0xef)) or
598  (between_inc(byte1, 0x90, 0x99) and byte2 >= 0xa0)) and
599  (byte2 >= 0xa0))
600  return start + 3;
601 
602  if (start + 4 > buffer_len)
603  PQXX_UNLIKELY
604  throw_for_encoding_error("MULE_INTERNAL", buffer, start, 3);
605 
606  if (
607  ((byte1 == 0x9c and between_inc(byte2, 0xf0, 0xf4)) or
608  (byte1 == 0x9d and between_inc(byte2, 0xf5, 0xfe))) and
609  get_byte(buffer, start + 2) >= 0xa0 and
610  get_byte(buffer, start + 4) >= 0xa0)
611  return start + 4;
612 
613  PQXX_UNLIKELY
614  throw_for_encoding_error("MULE_INTERNAL", buffer, start, 4);
615  }
616 };
617 
618 
619 // As far as I can tell, for the purposes of iterating the only difference
620 // between SJIS and SJIS-2004 is increased range in the first byte of two-byte
621 // sequences (0xEF increased to 0xFC). Officially, that is; apparently the
622 // version of SJIS used by Postgres has the same range as SJIS-2004. They both
623 // have increased range over the documented versions, not having the even/odd
624 // restriction for the first byte in 2-byte sequences.
625 //
626 // https://en.wikipedia.org/wiki/Shift_JIS#Shift_JIS_byte_map
627 // http://x0213.org/codetable/index.en.html
628 template<> struct glyph_scanner<encoding_group::SJIS>
629 {
630  static PQXX_PURE std::size_t
631  call(char const buffer[], std::size_t buffer_len, std::size_t start)
632  {
633  if (start >= buffer_len)
634  return std::string::npos;
635 
636  auto const byte1{get_byte(buffer, start)};
637  if (byte1 < 0x80 or between_inc(byte1, 0xa1, 0xdf))
638  return start + 1;
639 
640  if (
641  not between_inc(byte1, 0x81, 0x9f) and
642  not between_inc(byte1, 0xe0, 0xfc))
643  PQXX_UNLIKELY
644  throw_for_encoding_error("SJIS", buffer, start, 1);
645 
646  if (start + 2 > buffer_len)
647  PQXX_UNLIKELY
648  throw_for_encoding_error("SJIS", buffer, start, buffer_len - start);
649 
650  auto const byte2{get_byte(buffer, start + 1)};
651  if (byte2 == 0x7f)
652  PQXX_UNLIKELY
653  throw_for_encoding_error("SJIS", buffer, start, 2);
654 
655  if (between_inc(byte2, 0x40, 0x9e) or between_inc(byte2, 0x9f, 0xfc))
656  return start + 2;
657 
658  PQXX_UNLIKELY
659  throw_for_encoding_error("SJIS", buffer, start, 2);
660  }
661 };
662 
663 
664 // https://en.wikipedia.org/wiki/Unified_Hangul_Code
665 template<> struct glyph_scanner<encoding_group::UHC>
666 {
667  static PQXX_PURE std::size_t
668  call(char const buffer[], std::size_t buffer_len, std::size_t start)
669  {
670  if (start >= buffer_len)
671  PQXX_UNLIKELY return std::string::npos;
672 
673  auto const byte1{get_byte(buffer, start)};
674  if (byte1 < 0x80)
675  return start + 1;
676 
677  if (start + 2 > buffer_len)
678  PQXX_UNLIKELY
679  throw_for_encoding_error("UHC", buffer, start, buffer_len - start);
680 
681  auto const byte2{get_byte(buffer, start + 1)};
682  if (between_inc(byte1, 0x80, 0xc6))
683  {
684  if (
685  between_inc(byte2, 0x41, 0x5a) or between_inc(byte2, 0x61, 0x7a) or
686  between_inc(byte2, 0x80, 0xfe))
687  return start + 2;
688 
689  PQXX_UNLIKELY
690  throw_for_encoding_error("UHC", buffer, start, 2);
691  }
692 
693  if (between_inc(byte1, 0xa1, 0xfe))
694  {
695  if (not between_inc(byte2, 0xa1, 0xfe))
696  PQXX_UNLIKELY
697  throw_for_encoding_error("UHC", buffer, start, 2);
698 
699  return start + 2;
700  }
701 
702  throw_for_encoding_error("UHC", buffer, start, 1);
703  }
704 };
705 
706 
707 // https://en.wikipedia.org/wiki/UTF-8#Description
708 template<> struct glyph_scanner<encoding_group::UTF8>
709 {
710  static PQXX_PURE std::size_t
711  call(char const buffer[], std::size_t buffer_len, std::size_t start)
712  {
713  if (start >= buffer_len)
714  PQXX_UNLIKELY return std::string::npos;
715 
716  auto const byte1{get_byte(buffer, start)};
717  if (byte1 < 0x80)
718  return start + 1;
719 
720  if (start + 2 > buffer_len)
721  PQXX_UNLIKELY
722  throw_for_encoding_error("UTF8", buffer, start, buffer_len - start);
723 
724  auto const byte2{get_byte(buffer, start + 1)};
725  if (between_inc(byte1, 0xc0, 0xdf))
726  {
727  if (not between_inc(byte2, 0x80, 0xbf))
728  PQXX_UNLIKELY
729  throw_for_encoding_error("UTF8", buffer, start, 2);
730 
731  return start + 2;
732  }
733 
734  if (start + 3 > buffer_len)
735  PQXX_UNLIKELY
736  throw_for_encoding_error("UTF8", buffer, start, buffer_len - start);
737 
738  auto const byte3{get_byte(buffer, start + 2)};
739  if (between_inc(byte1, 0xe0, 0xef))
740  {
741  if (between_inc(byte2, 0x80, 0xbf) and between_inc(byte3, 0x80, 0xbf))
742  return start + 3;
743 
744  PQXX_UNLIKELY
745  throw_for_encoding_error("UTF8", buffer, start, 3);
746  }
747 
748  if (start + 4 > buffer_len)
749  PQXX_UNLIKELY
750  throw_for_encoding_error("UTF8", buffer, start, buffer_len - start);
751 
752  if (between_inc(byte1, 0xf0, 0xf7))
753  {
754  if (
755  between_inc(byte2, 0x80, 0xbf) and between_inc(byte3, 0x80, 0xbf) and
756  between_inc(get_byte(buffer, start + 3), 0x80, 0xbf))
757  return start + 4;
758 
759  PQXX_UNLIKELY
760  throw_for_encoding_error("UTF8", buffer, start, 4);
761  }
762 
763  PQXX_UNLIKELY
764  throw_for_encoding_error("UTF8", buffer, start, 1);
765  }
766 };
767 
768 
770 
784 constexpr inline encoding_group
785 map_ascii_search_group(encoding_group enc) noexcept
786 {
787  switch (enc)
788  {
789  case encoding_group::MONOBYTE:
790  case encoding_group::EUC_CN:
791  case encoding_group::EUC_JP:
792  case encoding_group::EUC_KR:
793  case encoding_group::EUC_TW:
794  case encoding_group::MULE_INTERNAL:
795  case encoding_group::UTF8:
796  // All these encodings are "ASCII-safe," meaning that if we're looking
797  // for a particular ASCII character, we can safely just go through the
798  // string byte for byte. Multibyte characters have the high bit set.
799  return encoding_group::MONOBYTE;
800 
801  default: PQXX_UNLIKELY return enc;
802  }
803 }
804 
805 
807 
813 template<char... NEEDLE>
814 PQXX_PURE constexpr inline char_finder_func *
815 get_char_finder(encoding_group enc)
816 {
817  auto const as_if{map_ascii_search_group(enc)};
818  switch (as_if)
819  {
820  case encoding_group::MONOBYTE:
821  return pqxx::internal::find_ascii_char<
822  encoding_group::MONOBYTE, NEEDLE...>;
823  case encoding_group::BIG5:
824  return pqxx::internal::find_ascii_char<encoding_group::BIG5, NEEDLE...>;
825  case encoding_group::GB18030:
826  return pqxx::internal::find_ascii_char<encoding_group::GB18030, NEEDLE...>;
827  case encoding_group::GBK:
828  return pqxx::internal::find_ascii_char<encoding_group::GBK, NEEDLE...>;
829  case encoding_group::JOHAB:
830  return pqxx::internal::find_ascii_char<encoding_group::JOHAB, NEEDLE...>;
831  case encoding_group::SJIS:
832  return pqxx::internal::find_ascii_char<encoding_group::SJIS, NEEDLE...>;
833  case encoding_group::UHC:
834  return pqxx::internal::find_ascii_char<encoding_group::UHC, NEEDLE...>;
835 
836  default:
838  "Unexpected encoding group: ", as_if, " (mapped from ", enc, ").")};
839  }
840 }
841 
842 
844 
847 template<char... NEEDLE>
848 PQXX_PURE constexpr inline char_finder_func *
849 get_s_char_finder(encoding_group enc)
850 {
851  auto const as_if{map_ascii_search_group(enc)};
852  switch (as_if)
853  {
854  case encoding_group::MONOBYTE:
856  encoding_group::MONOBYTE, NEEDLE...>;
857  case encoding_group::BIG5:
858  return pqxx::internal::find_s_ascii_char<encoding_group::BIG5, NEEDLE...>;
859  case encoding_group::GB18030:
861  encoding_group::GB18030, NEEDLE...>;
862  case encoding_group::GBK:
863  return pqxx::internal::find_s_ascii_char<encoding_group::GBK, NEEDLE...>;
864  case encoding_group::JOHAB:
865  return pqxx::internal::find_s_ascii_char<encoding_group::JOHAB, NEEDLE...>;
866  case encoding_group::SJIS:
867  return pqxx::internal::find_s_ascii_char<encoding_group::SJIS, NEEDLE...>;
868  case encoding_group::UHC:
869  return pqxx::internal::find_s_ascii_char<encoding_group::UHC, NEEDLE...>;
870 
871  default:
873  "Unexpected encoding group: ", as_if, " (mapped from ", enc, ").")};
874  }
875 }
876 } // namespace pqxx::internal
877 #endif
static PQXX_PURE std::size_t call(char const buffer[], std::size_t buffer_len, std::size_t start)
Find the next glyph in buffer after position start.
std::string concat(TYPE...item)
Efficiently combine a bunch of items into one big string.
Definition: concat.hxx:31
void for_glyphs(encoding_group enc, CALLABLE callback, char const buffer[], std::size_t buffer_len, std::size_t start=0)
Iterate over the glyphs in a buffer.
Definition: encodings.hxx:87
Internal items for libpqxx' own use. Do not use these yourself.
Definition: encodings.cxx:32
PQXX_PURE constexpr char_finder_func * get_char_finder(encoding_group enc)
Look up a character search function for an encoding group.
Definition: encodings.hxx:815
PQXX_PURE constexpr char_finder_func * get_s_char_finder(encoding_group enc)
Look up a "sentry" character search function for an encoding group.
Definition: encodings.hxx:849
Wrapper struct template for "find next glyph" functions.
Definition: encodings.hxx:142
The home of all libpqxx classes, functions, templates, etc.
Definition: array.cxx:26
Invalid argument passed to libpqxx, similar to std::invalid_argument.
Definition: except.hxx:265
std::size_t(char const buffer[], std::size_t buffer_len, std::size_t start) glyph_scanner_func
Function type: "find the end of the current glyph."
Definition: encoding_group.hxx:53
constexpr encoding_group map_ascii_search_group(encoding_group enc) noexcept
Just for searching an ASCII character, what encoding can we use here?
Definition: encodings.hxx:785
std::size_t find_char(glyph_scanner_func *scanner, std::string_view haystack, std::size_t here=0u)
Find any of the ASCII characters NEEDLE in haystack.
Definition: encodings.hxx:52
std::size_t(std::string_view haystack, std::size_t start) char_finder_func
Function type: "find first occurrence of any of a specific set of ASCII characters."
Definition: encoding_group.hxx:71
pqxx::internal::encoding_group enc_group(std::string_view encoding_name)
Convert libpq encoding name to its libpqxx encoding group.
Definition: encodings.cxx:35
Internal error in libpqxx library.
Definition: except.hxx:241
PQXX_PURE std::size_t find_s_ascii_char(std::string_view haystack, std::size_t here)
Find first of NEEDLE ASCII chars in haystack.
Definition: encodings.hxx:211
PQXX_LIBEXPORT glyph_scanner_func * get_glyph_scanner(encoding_group)
Look up the glyph scanner function for a given encoding group.
PQXX_PURE char const * name_encoding(int encoding_id)
Return PostgreSQL's name for encoding enum value.