AsciiDocParser.java

package pro.verron.asciidoc.core;

import org.jspecify.annotations.NonNull;

import java.util.*;
import java.util.function.Function;

/// The [AsciiDocParser] class is a utility for parsing AsciiDoc-formatted text
/// and transforming it into structured models. It provides both static and
/// instance-based parsing capabilities and is designed to work with inline
/// structures within the text.
///
/// ## Methods
///   - [#parse(String)]: A static method to parse an AsciiDoc string into an
/// [AsciiDocModel].
///   - [#apply(String)]: An instance method implementing the [Function]
/// interface for parsing an AsciiDoc string
/// into an [AsciiDocModel].
///
/// ## Internal Behavior
///   - [#parseInlines(String)]: A private static helper method to parse inline
/// elements from a given text input.
public final class AsciiDocParser
        implements Function<String, AsciiDocModel> {

    /// Constructs an [AsciiDocParser].
    public AsciiDocParser() {}

    /// Parses the given AsciiDoc string and produces an [AsciiDocModel].
    ///
    /// @param asciidoc the AsciiDoc content to parse
    ///
    /// @return an [AsciiDocModel] representing the parsed structure
    public static AsciiDocModel parse(String asciidoc) {
        return new AsciiDocParser().apply(asciidoc);
    }

    /// Processes an AsciiDoc-formatted string and converts it into an
    /// [AsciiDocModel].
    ///
    /// The resulting model contains structured blocks such as paragraphs,
    /// headings,
    /// lists, tables, images, code blocks, and blockquotes.
    ///
    /// @param asciidoc the AsciiDoc-formatted input string
    ///
    /// @return an [AsciiDocModel] representing the parsed blocks; empty model
    /// if input is blank
    public AsciiDocModel apply(String asciidoc) {
        if (asciidoc.isBlank()) return AsciiDocModel.of(new ArrayList<>());

        var lines = asciidoc.split("\r?\n");
        var attributes = new HashMap<String, String>();
        var lineIndex = 0;
        while (lineIndex < lines.length) {
            var line = lines[lineIndex].trim();
            if (line.startsWith(":") && line.contains(":")) {
                var secondColon = line.indexOf(':', 1);
                if (secondColon != -1) {
                    var key = line.substring(1, secondColon);
                    var value = line.substring(secondColon + 1)
                                    .trim();
                    attributes.put(key, value);
                    lineIndex++;
                }
                else break;
            }
            else if (line.isEmpty()) lineIndex++;
            else break;
        }

        var currentParagraph = new StringBuilder();
        var inTable = false;
        var inBlockquote = false;
        var inCodeBlock = false;
        var currentTableRows = new ArrayList<Row>();
        var currentBlockContent = new StringBuilder();
        var nextBlockHeader = new ArrayList<String>();
        var inOpenBlock = false;
        var openBlockBlocks = new ArrayList<Block>();
        var openBlockHeader = new ArrayList<String>();

        var blocks = new ArrayList<Block>();
        var currentContainer = blocks;

        for (int i = lineIndex; i < lines.length; i++) {
            var line = lines[i];
            var trimmed = line.trim();

            if (trimmed.startsWith("[") && trimmed.endsWith("]") && !inCodeBlock
                && !inBlockquote && !inTable) {
                nextBlockHeader.addAll(Arrays.asList(trimmed.substring(1,
                                                                    trimmed.length() - 1)
                                                            .split(",")));
                continue;
            }

            if (trimmed.equals("--") && !inCodeBlock && !inBlockquote
                && !inTable) {
                if (!currentParagraph.isEmpty()) {
                    currentContainer.add(createParagraph(nextBlockHeader,
                            currentParagraph.toString()));
                    nextBlockHeader.clear();
                    currentParagraph.setLength(0);
                }

                if (inOpenBlock) {
                    blocks.add(new OpenBlock(new ArrayList<>(openBlockHeader),
                            new ArrayList<>(openBlockBlocks)));
                    openBlockBlocks.clear();
                    openBlockHeader.clear();
                    inOpenBlock = false;
                    currentContainer = blocks;
                }
                else {
                    inOpenBlock = true;
                    openBlockHeader = new ArrayList<>(nextBlockHeader);
                    nextBlockHeader.clear();
                    currentContainer = openBlockBlocks;
                }
                continue;
            }

            if (trimmed.equals("____")) {
                if (inBlockquote) {
                    var currentBlockContentString =
                            currentBlockContent.toString();
                    currentContainer.add(new QuoteBlock(parseInlines(
                            currentBlockContentString.trim())));
                    currentBlockContent.setLength(0);
                    inBlockquote = false;
                }
                else {
                    if (!currentParagraph.isEmpty()) {
                        currentContainer.add(createParagraph(nextBlockHeader,
                                currentParagraph.toString()));
                        nextBlockHeader.clear();
                        currentParagraph.setLength(0);
                    }

                    inBlockquote = true;
                }
                continue;
            }

            if (inBlockquote) {
                if (!currentBlockContent.isEmpty()) {
                    currentBlockContent.append(" ");
                }
                currentBlockContent.append(trimmed);
                continue;
            }

            if (trimmed.equals("----")) {
                if (inCodeBlock) {
                    var currentBlockContentString =
                            currentBlockContent.toString();
                    var language = (nextBlockHeader.size() > 1)
                            ? nextBlockHeader.get(1)
                            : "";
                    currentContainer.add(new CodeBlock(language,
                            currentBlockContentString.trim()));
                    currentBlockContent.setLength(0);
                    nextBlockHeader.clear();
                    inCodeBlock = false;
                }
                else {
                    if (!currentParagraph.isEmpty()) {
                        currentContainer.add(createParagraph(nextBlockHeader,
                                currentParagraph.toString()));
                        nextBlockHeader.clear();
                        currentParagraph.setLength(0);
                    }

                    inCodeBlock = true;
                }
                continue;
            }

            if (inCodeBlock) {
                if (!currentBlockContent.isEmpty()) {
                    currentBlockContent.append("\n");
                }
                currentBlockContent.append(line); // Preserve indentation in
                // code blocks
                continue;
            }

            if (trimmed.startsWith("image::")) {
                if (!currentParagraph.isEmpty()) {
                    currentContainer.add(createParagraph(nextBlockHeader,
                            currentParagraph.toString()));
                    nextBlockHeader.clear();
                    currentParagraph.setLength(0);
                }

                int endUrl = trimmed.indexOf('[', 7);
                if (endUrl != -1) {
                    int endText = trimmed.indexOf(']', endUrl);
                    if (endText != -1) {
                        String url = trimmed.substring(7, endUrl);
                        String altText = trimmed.substring(endUrl + 1, endText);
                        currentContainer.add(new ImageBlock(url, altText));
                        continue;
                    }
                }
            }

            if (trimmed.contains("::[")) {
                int macroNameEnd = trimmed.indexOf("::[");
                String macroName = trimmed.substring(0, macroNameEnd);
                int endBracket = trimmed.indexOf(']', macroNameEnd + 3);
                if (endBracket != -1) {
                    if (!currentParagraph.isEmpty()) {
                        currentContainer.add(createParagraph(nextBlockHeader,
                                currentParagraph.toString()));
                        nextBlockHeader.clear();
                        currentParagraph.setLength(0);
                    }

                    String content = trimmed.substring(macroNameEnd + 3,
                            endBracket);
                    // Simple attribute parser for id="value", key="value"
                    var attrs = new ArrayList<String>();
                    String id = "";
                    String[] parts = content.split(",\\s*");
                    for (String part : parts) {
                        if (part.startsWith("id=\"")
                            || part.startsWith("id=")) {
                            id = part.substring(part.indexOf('=') + 1)
                                     .replace("\"", "");
                        }
                        else {
                            attrs.add(part.trim());
                        }
                    }
                    var macroBlock = new MacroBlock(attrs, macroName, id);
                    currentContainer.add(macroBlock);
                    continue;
                }
            }

            if (trimmed.equals("|===")) {
                if (inTable) {
                    currentContainer.add(new Table(currentTableRows));
                    currentTableRows = new ArrayList<>();
                    inTable = false;
                }
                else {
                    if (!currentParagraph.isEmpty()) {
                        currentContainer.add(createParagraph(nextBlockHeader,
                                currentParagraph.toString()));
                        nextBlockHeader.clear();
                        currentParagraph.setLength(0);
                    }

                    inTable = true;
                }
                continue;
            }

            if (inTable) {
                if (trimmed.startsWith("|")) {
                    String[] cellTexts = trimmed.substring(1)
                                                .split("\\|");
                    List<Cell> cells = new ArrayList<>();
                    for (String cellText : cellTexts) {
                        cells.add(Cell.ofInlines(parseInlines(cellText.trim())));
                    }
                    currentTableRows.add(new Row(cells));
                }
                continue;
            }

            if (trimmed.isBlank()) {
                if (!currentParagraph.isEmpty()) {
                    currentContainer.add(createParagraph(nextBlockHeader,
                            currentParagraph.toString()));
                    nextBlockHeader.clear();
                    currentParagraph.setLength(0);
                }

                continue;
            }

            // Check for Headings
            if (trimmed.startsWith("=")) {
                int level = 0;
                while (level < trimmed.length()
                       && trimmed.charAt(level) == '=') {
                    level++;
                }
                if (level > 0 && level <= 6 && level < trimmed.length()
                    && Character.isWhitespace(trimmed.charAt(level))) {
                    if (!currentParagraph.isEmpty()) {
                        currentContainer.add(createParagraph(nextBlockHeader,
                                currentParagraph.toString()));
                        nextBlockHeader.clear();
                        currentParagraph.setLength(0);
                    }

                    var title = trimmed.substring(level);
                    title = title.trim();
                    currentContainer.add(new Heading(new ArrayList<>(
                            nextBlockHeader), level, parseInlines(title)));
                    nextBlockHeader.clear();
                    continue;
                }
            }

            // Check for Unordered List Item
            if (trimmed.startsWith("* ")) {
                if (!currentParagraph.isEmpty()) {
                    currentContainer.add(createParagraph(nextBlockHeader,
                            currentParagraph.toString()));
                    nextBlockHeader.clear();
                    currentParagraph.setLength(0);
                }

                var itemText = trimmed.substring(2);
                itemText = itemText.trim();
                var item = new ListItem(parseInlines(itemText));
                if (!currentContainer.isEmpty()
                    && currentContainer.getLast() instanceof UnorderedList(
                        List<ListItem> items1
                )) {
                    List<ListItem> items = new ArrayList<>(items1);
                    items.add(item);
                    currentContainer.set(currentContainer.size() - 1,
                            new UnorderedList(items));
                }
                else currentContainer.add(new UnorderedList(List.of(item)));
                continue;
            }

            // Check for Ordered List Item
            if (trimmed.startsWith(". ")) {
                if (!currentParagraph.isEmpty()) {
                    currentContainer.add(createParagraph(nextBlockHeader,
                            currentParagraph.toString()));
                    nextBlockHeader.clear();
                    currentParagraph.setLength(0);
                }

                String itemText = trimmed.substring(2)
                                         .trim();
                var item = new ListItem(parseInlines(itemText));
                if (!currentContainer.isEmpty()
                    && currentContainer.getLast() instanceof OrderedList(
                        List<ListItem> items1
                )) {
                    List<ListItem> items = new ArrayList<>(items1);
                    items.add(item);
                    currentContainer.set(currentContainer.size() - 1,
                            new OrderedList(items));
                }
                else currentContainer.add(new OrderedList(List.of(item)));
                continue;
            }

            // Otherwise, it's a paragraph part
            if (!currentParagraph.isEmpty()) {
                currentParagraph.append("\n");
            }
            currentParagraph.append(trimmed);
        }

        if (!currentParagraph.isEmpty()) {
            var currentParagraphString = currentParagraph.toString();
            var inlines = parseInlines(currentParagraphString.trim());
            currentContainer.add(createParagraph(nextBlockHeader, inlines));
        }

        return AsciiDocModel.of(attributes, blocks);
    }

    private static @NonNull Paragraph createParagraph(
            ArrayList<String> nextBlockHeader,
            String string
    ) {
        return createParagraph(nextBlockHeader, parseInlines(string.trim()));
    }

    private static List<Inline> parseInlines(String text) {
        // Stack-based inline parser with simple tokens for '*', '_', text,
        // and escapes.
        // Non-overlapping nesting is allowed; crossing markers are treated
        // as plain text.
        var root = new Frame(FrameType.ROOT);
        var stack = new ArrayList<Frame>();
        stack.add(root);

        if (text.isEmpty()) return root.children;

        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);

            // Escapes for '*', '_', and '\\'
            if (c == '\\') {
                if (i + 1 < text.length()) {
                    char next = text.charAt(i + 1);
                    if (next == '*' || next == '_' || next == '\\') {
                        stack.getLast().text.append(next);
                        i++;
                        continue;
                    }
                }
                // Lone backslash
                stack.getLast().text.append(c);
                continue;
            }

            if (c == '*' || c == '_') {
                FrameType type = (c == '*') ? FrameType.BOLD : FrameType.ITALIC;
                Frame top = stack.getLast();
                if (top.type == type) {
                    // Close current frame
                    top.flushTextToChildren();
                    Inline node = (type == FrameType.BOLD)
                            ? new Bold(top.children)
                            : new Italic(top.children);
                    stack.removeLast();
                    Frame parent = stack.getLast();
                    parent.children.add(node);
                }
                else if (top.type == FrameType.BOLD
                         || top.type == FrameType.ITALIC
                         || top.type == FrameType.ROOT) {
                    // Open new frame
                    top.flushTextToChildren();
                    Frame f = new Frame(type);
                    stack.add(f);
                }
                else {
                    // Should not happen
                    stack.getLast().text.append(c);
                }
                continue;
            }

            // Detect literal |TAB| token -> emit a Tab inline
            if (c == '|' && i + 4 < text.length() && text.charAt(i + 1) == 'T'
                && text.charAt(i + 2) == 'A' && text.charAt(i + 3) == 'B'
                && text.charAt(i + 4) == '|') {
                // Flush any pending text
                stack.getLast()
                     .flushTextToChildren();
                stack.getLast().children.add(new Tab());
                i += 4;
                continue;
            }

            // Simple Link detection: https://example.com[Text]
            if (c == 'h' && text.startsWith("http", i)) {
                int endUrl = text.indexOf('[', i);
                if (endUrl != -1) {
                    int endText = text.indexOf(']', endUrl);
                    if (endText != -1) {
                        stack.getLast()
                             .flushTextToChildren();
                        String url = text.substring(i, endUrl);
                        String linkText = text.substring(endUrl + 1, endText);
                        stack.getLast().children.add(new Link(url, linkText));
                        i = endText;
                        continue;
                    }
                }
            }

            // Simple Image detection: image:url[AltText]
            if (c == 'i' && text.startsWith("image:", i)) {
                int endUrl = text.indexOf('[', i + 6);
                if (endUrl != -1) {
                    int endText = text.indexOf(']', endUrl);
                    if (endText != -1) {
                        stack.getLast()
                             .flushTextToChildren();
                        String url = text.substring(i + 6, endUrl);
                        String title = text.substring(endUrl + 1, endText);
                        stack.getLast().children.add(new ImageInline(url,
                                Map.of("title", title)));
                        i = endText;
                        continue;
                    }
                }
            }

            // Regular char
            stack.getLast().text.append(c);
        }

        // Unwind: any unclosed frames become literal markers + content as
        // plain text in parent
        while (stack.size() > 1) {
            Frame unfinished = stack.removeLast();
            char marker = unfinished.type == FrameType.BOLD ? '*' : '_';
            unfinished.flushTextToChildren();
            // Build literal: marker + children as text + (no closing marker
            // since it is missing)
            StringBuilder literal = new StringBuilder();
            literal.append(marker);
            for (Inline in : unfinished.children) {
                literal.append(in.text());
            }
            stack.getLast().text.append(literal);
        }

        // Flush remainder text on root
        root.flushTextToChildren();
        return root.children;
    }

    private static @NonNull Paragraph createParagraph(
            ArrayList<String> nextBlockHeader,
            List<Inline> string
    ) {
        return new Paragraph(new ArrayList<>(nextBlockHeader), string);
    }

    private enum FrameType {
        ROOT,
        BOLD,
        ITALIC
    }

    private static final class Frame {
        final FrameType type;
        final List<Inline> children = new ArrayList<>();
        final StringBuilder text = new StringBuilder();

        Frame(FrameType type) {this.type = type;}

        void flushTextToChildren() {
            if (!text.isEmpty()) {
                children.add(new Text(text.toString()));
                text.setLength(0);
            }
        }
    }
}