Common Tasks¶

This page is the recipe layer above the full reference. The examples here are small, runnable, and compiled in CI.

Validate runtime UTF-8 once¶

When text arrives as raw bytes, validate it once and keep the validated type:

#include "unicode_ranges_all.hpp"

#include <print>
#include <string>

using namespace unicode_ranges;

int main()
{
    // Raw bytes whose encoding has not been checked yet.
    std::string raw = "Grüße din România 👋";

    // Validate once. The result either holds the validated text or an error
    // that reports where validation failed.
    auto text = utf8_string::from_bytes(raw);
    if (!text)
    {
        std::println(stderr,
            "Invalid UTF-8 at byte {}",
            text.error().first_invalid_element_index);
        return 1;
    }

    // From here on, only the validated value is used.
    std::println("{}", *text);                  // Grüße din România 👋
    std::println("{}", text->char_count());     // 19 (5 + 1 + 3 + 1 + 7 + 1 + 1 scalars)
    std::println("{}", text->front().value());  // G
    std::println("{}", text->back().value());   // 👋
}

Iterate scalars versus graphemes¶

Use chars() when you need Unicode scalar values. Use graphemes() when you need user-perceived characters.

#include "unicode_ranges_all.hpp"

#include <print>

using namespace unicode_ranges;
using namespace unicode_ranges::literals;

int main()
{
    // A decomposed "é" (e + combining acute), a two-scalar flag, and "!".
    constexpr auto sample = "é🇷🇴!"_utf8_sv;

    std::println("{}", sample);  // é🇷🇴!

    // Three different ways of measuring the same text.
    const auto code_units = sample.size();
    const auto scalars = sample.char_count();
    const auto clusters = sample.grapheme_count();
    std::println("{}", code_units);  // 12 UTF-8 code units
    std::println("{}", scalars);     // 5 Unicode scalars
    std::println("{}", clusters);    // 3 graphemes

    // Search results are reported as code-unit offsets.
    const auto bang_offset = sample.find("!"_u8c);
    const auto flag_offset = sample.find("🇷"_u8c);
    std::println("{}", bang_offset);  // 11
    std::println("{}", flag_offset);  // 3

    // Scalar view versus user-perceived character view.
    std::println("{}", sample.chars());         // [e, ́, 🇷, 🇴, !]
    std::println("{::s}", sample.graphemes());  // [é, 🇷🇴, !]
}

Inspect grapheme boundaries¶

When byte or code-unit offsets matter, use the explicit boundary helpers rather than assuming every scalar boundary is also a grapheme boundary:

#include "unicode_ranges_all.hpp"

#include <print>

using namespace unicode_ranges;
using namespace unicode_ranges::literals;

int main()
{
    constexpr auto sample = "é🇷🇴!"_utf8_sv;

    // Offset 1 sits between "e" and its combining accent;
    // offset 7 sits between the two halves of the flag.
    constexpr auto after_base_letter = 1;
    constexpr auto mid_flag = 7;

    const auto scalar_edge = sample.is_char_boundary(after_base_letter);
    const auto cluster_edge = sample.is_grapheme_boundary(after_base_letter);
    std::println("{}", scalar_edge);   // true
    std::println("{}", cluster_edge);  // false

    const auto next_edge = sample.ceil_grapheme_boundary(mid_flag);
    const auto prev_edge = sample.floor_grapheme_boundary(mid_flag);
    std::println("{}", next_edge);  // 11
    std::println("{}", prev_edge);  // 3

    std::println("{}", sample.chars());         // [e, ́, 🇷, 🇴, !]
    std::println("{::s}", sample.graphemes());  // [é, 🇷🇴, !]
}

Choose ASCII-only versus Unicode-aware casing¶

ASCII-only transforms are intentionally narrow and fast. Unicode-aware transforms are the ones to use for real text:

#include "unicode_ranges_all.hpp"

#include <print>

using namespace unicode_ranges;
using namespace unicode_ranges::literals;

int main()
{
    constexpr auto mixed_case = "straße café"_utf8_sv;
    constexpr auto shouted = "CAFÉ Ω"_utf8_sv;

    // ASCII-only: non-ASCII letters pass through untouched.
    std::println("{}", mixed_case.to_ascii_uppercase());  // STRAßE CAFé

    // Unicode-aware: handles expansions such as ß -> SS.
    std::println("{}", mixed_case.to_uppercase());  // STRASSE CAFÉ
    std::println("{}", shouted.to_lowercase());     // café ω
}

Normalize before equality on equivalent spellings¶

Visually identical Unicode text can be spelled with different scalar sequences. Normalize both sides before comparing:

#include "unicode_ranges_all.hpp"

#include <print>

using namespace unicode_ranges;
using namespace unicode_ranges::literals;

int main()
{
    // Same visible text, two spellings: one precomposed scalar versus
    // a base letter followed by a combining accent.
    const auto from_precomposed = u8"é"_utf8_sv.to_nfd();
    const auto from_combining = u8"e\u0301"_utf8_sv.to_nfd();

    // Both sides are in NFD now, so equality compares like with like.
    const auto same_text = from_precomposed == from_combining;
    std::println("{}", same_text);  // true
}

Convert UTF-8, UTF-16, and UTF-32¶

Validated UTF-8, UTF-16, and UTF-32 text can be converted directly without falling back to raw transcoding code:

#include "unicode_ranges_all.hpp"

#include <print>

using namespace unicode_ranges;
using namespace unicode_ranges::literals;

int main()
{
    const auto original = u8"Grüße 👋"_utf8_sv;

    // Fan out from UTF-8 to the two wider encodings.
    const auto as_utf16 = original.to_utf16();
    const auto as_utf32 = original.to_utf32();

    // Then convert each one back to UTF-8.
    const auto back_from_utf16 = as_utf16.to_utf8();
    const auto back_from_utf32 = as_utf32.to_utf8();

    std::println("{}", as_utf16);         // Grüße 👋
    std::println("{}", as_utf32);         // Grüße 👋
    std::println("{}", back_from_utf16);  // Grüße 👋
    std::println("{}", back_from_utf32);  // Grüße 👋
}