Literate Programming 4

The overall program layout looks like this:

__main__

File Header
Library Imports
Definitions
Utility Functions
Main Function
Comments to test appending and replacing

////////////
// WEB4.D
//
// This is a level 4 bootstrapping Literate Programming thing.
// It will insert indices and tables of contents.  It may also allow appending or replacing sections.
//
module web3;

Library Imports

private import std.algorithm; // Needed for countUntil and searching
private import std.ascii;     // Character type checks.
private import std.file;      // Needed for file input and output
private import std.stdio;     // Needed for error reporting and my debugging
private import std.string;    // These programs are all about string processing.

I've converted the bool isCode/isIdentifier into a consistent enum and started tracking line numbers within each text block.

Definitions

// Classifies both whole sections (SSection.type) and the individual blocks
// inside them (SBlock.type).
enum ESectionType {
    CODE,       // Program text: the "__main__" section and named definitions.
    HEADER,     // A titled section introduced by "@*".
    PARAGRAPH,  // Plain documentation text.
    IDENTIFIER, // A reference to a named definition; expanded when tangling.
    INDEX_TERM, // Not referenced in the code visible here; presumably a custom index entry.
    PRE, // Terms?  Literal/Emphasis?
    BOLD,       // Not referenced in the code visible here; presumably bold styled text.
};

// One top-level section of the web file.
struct SSection {
    string name;        // "__main__", a definition's identifier, a header title, or "" for paragraphs.
    ESectionType type;  // What kind of section this is.
    SBlock[] contents;  // The blocks that make up the section, in file order.
};

// A contiguous run of text inside a section.
struct SBlock {
    ESectionType type;  // Kind of text held in this block.
    int lineNumber;     // Line counter value when the block started; emitted in #line directives.
    string content;     // The block's text (for IDENTIFIER blocks, the identifier to expand).
};

New character definitions

Utility Functions

    An enhanced version of countUntil that can start at a given string position.
    Format code for display with colors and escaped HTML codes.
    Escape special HTML Characters with their safe entities.
    Parse an entire section of text, recursing for definitions if needed.
    Find Matching Identifiers
    expand_code_identifier
    parse_web_then_tangle_and_weave

I'm inserting these hide blocks here. I wish I could insert bits of display inside a code block.

An enhanced version of countUntil that can start at a given string position.

// Find the first occurrence of needle in haystack at or after startIndex.
//
// Returns the index of the match measured from the start of haystack, in
// code units, so the result can be used directly as a slice bound.
// Returns -1 when needle does not occur or startIndex lies outside haystack.
//
// std.string.indexOf is used instead of countUntil because countUntil counts
// decoded characters on a string, which does not match slice indices once
// the text contains non-ASCII characters.
ptrdiff_t countFromPosUntil(string haystack, ptrdiff_t startIndex, string needle)
{
    if(startIndex < 0 || cast(size_t) startIndex > haystack.length) {
        return -1;
    }
    const ptrdiff_t offset = indexOf(haystack[startIndex..$], needle);
    if(offset < 0) {
        return -1;
    }
    return startIndex + offset;
}

Format code for display with colors and escaped HTML codes.

// Turn a chunk of source code into HTML for display.
//
// The text is first passed through escapeHTMLCharacters, then scanned left
// to right:
//   - "//" comments up to (not including) the newline are wrapped in
//     <span class="code_comment">,
//   - single- and double-quoted literals are wrapped in
//     <span class="code_string">, honouring backslash escapes,
//   - a fixed list of words is wrapped in <span class="code_identifier">,
//   - everything else is copied through unchanged.
//
// source     - the raw code text.
// lineNumber - only used in the error message for an unterminated literal.
//
// Returns the HTML string. If a literal is never closed, an error is printed
// and the remainder of the text is emitted inside the string span so that no
// text is lost and the span is still closed.
string formatCodeForDisplay(string source, int lineNumber)
{
    // Words that get the "code_identifier" highlight.
    static immutable string[] highlightedWords = [ "const", "bool", "break", "char", "dchar", "else",
        "for", "if", "import", "int", "main", "module", "private", "return", "string",
        "std", "void", "while", ];

    string output = "";
    string scanner = escapeHTMLCharacters(source);
    while(!scanner.empty) {
        if(scanner.startsWith("//")) {
            // Color comments. The span covers the whole comment; the newline
            // itself is left for the next iteration.
            ptrdiff_t lineLength = indexOf(scanner, '\n');
            if(lineLength < 0) {
                lineLength = scanner.length;
            }
            output ~= "<span class=\"code_comment\">";
            output ~= scanner[0..lineLength];
            output ~= "</span>";
            scanner = scanner[lineLength..$];

        } else if(scanner[0] == '"' || scanner[0] == '\'') {
            // Color strings.
            const char stringType = scanner[0];
            output ~= "<span class=\"code_string\">";
            size_t stringLength = 1;
            while(stringLength < scanner.length && scanner[stringLength] != stringType) {
                if(scanner[stringLength] == '\\') {
                    // Skip the escaped character as well.
                    stringLength += 1;
                }
                stringLength += 1;
            }
            if(stringLength >= scanner.length) {
                writefln("ERROR: Unable to find close quote for string %s near line %d in string %s\n", scanner[0..min(scanner.length, 20)], lineNumber, source);
                // Keep the remaining text and close the span instead of
                // truncating the output with an open tag.
                output ~= scanner;
                output ~= "</span>";
                break;
            }
            output ~= scanner[0..stringLength + 1];
            output ~= "</span>";
            scanner = scanner[stringLength + 1..$];

        } else if(isAlpha(scanner[0])) {
            // Measure the word: ASCII letters, digits and underscores.
            size_t wordLength = 1;
            while(wordLength < scanner.length
                    && (isAlphaNum(scanner[wordLength]) || scanner[wordLength] == '_')) {
                wordLength += 1;
            }
            const string word = scanner[0..wordLength];

            if(highlightedWords.canFind(word)) {
                // Special identifiers
                output ~= "<span class=\"code_identifier\">";
                output ~= word;
                output ~= "</span>";
            } else {
                output ~= word;
            }
            scanner = scanner[wordLength..$];

        } else {
            // Copy any other code unit through untouched.
            output ~= scanner[0];
            scanner = scanner[1..$];
        }
    }

    return output;
}

Escape special HTML Characters with their safe entities.

// Replace the HTML-significant characters '<', '>' and '&' with their
// entities (&lt; &gt; &amp;) and copy everything else through unchanged.
//
// Quote characters are deliberately left alone: formatCodeForDisplay scans
// the escaped text for raw quotes to find string literals.
//
// The loop works on UTF-8 code units; the three characters handled are ASCII,
// so multi-byte sequences are copied byte for byte.
string escapeHTMLCharacters(string source)
{
    string output;
    foreach(char ch; source) {
        switch(ch) {
            case '<':
                output ~= "&lt;";
                break;
            case '>':
                output ~= "&gt;";
                break;
            case '&':
                output ~= "&amp;";
                break;
            default:
                output ~= ch;
                break;
        }
    }
    return output;
}

Parse an entire section of text, recursing for definitions if needed.

// Read text from contents, starting at offset, and split it into blocks.
//
// Characters are accumulated into the current block until the end of the
// input or an '@' control code that is not handled here, at which point
// scanning stops. "@@" is an escaped '@' and is copied as a single '@'.
// When recurse is true, "@<" flushes the current block (typed sectionType)
// and hands over to the "slurp identifier subsection" chunk, after which a
// closing "@>" is required.
//
// contents    - the whole web file.
// offset      - in: index to start at; out: index where scanning stopped.
// lineNumber  - incremented for every '\n' copied into a block.
// recurse     - whether "@<" identifier references are processed.
// sectionType - type given to blocks flushed before an identifier reference.
//
// Returns the blocks found; the trailing block is always appended, even if
// it is empty.
//
// NOTE(review): the chunks referenced by name below ("slurp identifier
// subsection", "slurp styled section", "Handle embedded index points") are
// defined elsewhere in the web and are not visible here.
SBlock[] slurp_section(string contents, ref int offset, ref int lineNumber, bool recurse, ESectionType sectionType)
{
    SBlock[] results;
    string currentBlock = "";
    int startLineNumber = lineNumber;
    int index = offset;
    for(; index < contents.length; index++) {
        if(contents[index] == '@') {
            // NOTE(review): contents[index + 1] is read without a bounds
            // check; an '@' as the very last character looks like it would
            // be out of range -- confirm.
            if(recurse && contents[index + 1] == '<') {
                // Flush the text gathered so far before the identifier.
                results ~= SBlock(sectionType, startLineNumber, currentBlock);
                currentBlock = "";
                startLineNumber = lineNumber;
                
                slurp identifier subsection
                
                // 'identifier' used below is presumably declared by the
                // chunk above -- verify.
                if(contents[index..$].startsWith("@>")) {
                    index += 2;
                } else {
                    writefln("Identifier '%s' invoked without close tag. at %s", identifier, contents[index..min($, index + 10)]);
                    break;
                }
            } else if(contents[index + 1] == '@') {
                currentBlock ~= contents[index];
                // Skip the escaped at symbol.
                index++;
                
            slurp styled section
                
            } else {
                // Any other control code ends this section.
                break;
            }
        } 
        
        Handle embedded index points
        
        else {
            if(contents[index] == '\n') {
                lineNumber++;
            }
            currentBlock ~= contents[index];
        }
    }
    // NOTE(review): the final block is typed CODE regardless of sectionType,
    // unlike the flush above -- confirm this is intended.
    results ~= SBlock(ESectionType.CODE, startLineNumber, currentBlock);
    
    // Report back to the caller where scanning stopped.
    offset = index;
    
    return results;
}

expand_code_identifier

// Produce the tangled code for a named definition.
//
// Every section returned by find_matching_identifiers for the identifier is
// emitted in order. Each block is preceded by a "#line" directive carrying
// the block's line number and inputFilename; blocks of type IDENTIFIER are
// expanded recursively, all other blocks are copied verbatim.
//
// sections      - all sections parsed from the web file.
// identifier    - name of the definition to expand.
// inputFilename - file name written into the #line directives.
//
// Returns the expanded text, prefixed with a "/* from ... */" marker. When
// no definition is found, an error is printed and an "ERROR: ..." string is
// returned instead.
string expand_code_identifier(SSection[] sections, string identifier, string inputFilename)
{
    SSection[] matches = find_matching_identifiers(sections, identifier);
    if(matches.empty) {
        writefln("ERROR: Unable to find identifier '%s'.", identifier);
        return format("ERROR: %s is undefined", identifier);
    }

    string expanded = "/* from " ~ identifier ~ " */";
    foreach(definition; matches) {
        foreach(piece; definition.contents) {
            expanded ~= format("\n#line %d \"%s\"\n", piece.lineNumber, inputFilename);
            expanded ~= (piece.type == ESectionType.IDENTIFIER)
                ? expand_code_identifier(sections, piece.content, inputFilename)
                : piece.content;
        }
    }
    return expanded;
}

parse_web_then_tangle_and_weave


// Parse the web file into sections, then append the tangled program to
// outputCodeContents.
//
// The file is scanned for '@' control codes; each one starts a new section
// whose body is read with slurp_section:
//   "@@"  escaped '@' (skipped here),
//   "@p"  the "__main__" program section,
//   "@>"  an end tag (skipped here),
//   "@<"  a named definition, which must be followed by "@>=",
//   "@*"  a header whose title runs up to the next '.',
//   other (e.g. "@ ") a paragraph that starts with "<p>".
// Text that does not start with '@' becomes a paragraph section.
//
// outputDisplayContents - not written by the code visible here; presumably
//                         filled by the "New Display Work" chunk -- verify.
// outputCodeContents    - receives the expanded main section.
// fileContents          - the whole web file.
// inputFilename         - passed on for #line directives.
//
// NOTE(review): the chunks referenced by name near the end are defined
// elsewhere in the web; 'mainSection' is presumably declared by the
// "New support for main sections" chunk.
void parse_web_then_tangle_and_weave(ref string outputDisplayContents, ref string outputCodeContents, string fileContents, string inputFilename)
{    
    SSection[] fileSections;

    int lineNumber = 0;
    int charIndex = 0;
    while(charIndex < fileContents.length) {
        dchar ch = fileContents[charIndex];
        
        if(ch == '@') { 
        
            // Character after the '@', or 0 when the '@' is the last one.
            dchar chNext = charIndex < fileContents.length - 1 ? fileContents[charIndex + 1] : 0;
        
            // Step over the two-character control code.
            charIndex += 2;
            if(chNext == '@') {
                // It's just an escaped at.  Continue parsing.
            } else if(chNext == 'p') {
            
                fileSections ~= SSection("__main__", ESectionType.CODE, slurp_section(fileContents, charIndex, lineNumber, true, ESectionType.CODE));
   
            } else if(chNext == '>') {
                //End tag.  This should be the end of this block.

            } else if(chNext == '<') {
                // Read the identifier name without recursing into references.
                SBlock[] identifierBlocks = slurp_section(fileContents, charIndex, lineNumber, false, ESectionType.IDENTIFIER);
                assert(identifierBlocks.length == 1);
                string identifier = identifierBlocks[0].content;
                
                SBlock[] sectionContents;
                // NOTE(review): this slice is taken without checking that
                // three characters remain; near the end of the file it looks
                // like it would be out of range -- confirm.
                if(fileContents[charIndex..charIndex + 3] == "@>=") {
                    charIndex += 3;
                     sectionContents = slurp_section(fileContents, charIndex, lineNumber, true, ESectionType.CODE);
                    
                } else {
                    writefln("Identifier '%s' invoked outside program and not a definition.", identifier);
                }
                
                // The section is recorded even when the definition marker
                // was missing; its contents are then empty.
                fileSections ~= SSection(identifier, ESectionType.CODE, sectionContents);
                
            } else if(chNext == '*') {
                // The title is the text up to the next period, if any.
                int titleEndingPeriod = countFromPosUntil(fileContents, charIndex, ".");
                string title = "";
                if(titleEndingPeriod > 0) {
                    title = fileContents[charIndex..titleEndingPeriod];
                    charIndex = titleEndingPeriod + 1;
                }
                
                fileSections ~= SSection(title, ESectionType.HEADER, slurp_section(fileContents, charIndex, lineNumber, false, ESectionType.HEADER));
  
            } else {
                // '@ ' will be converted into a section.
                fileSections ~= SSection("", ESectionType.PARAGRAPH, SBlock(ESectionType.PARAGRAPH, lineNumber, "<p>") ~ slurp_section(fileContents, charIndex, lineNumber, false, ESectionType.PARAGRAPH));
            }

        } else {
            fileSections ~= SSection("", ESectionType.PARAGRAPH, slurp_section(fileContents, charIndex, lineNumber, false, ESectionType.PARAGRAPH));
        }
    }
    
    New Display Work in parse_web_then_tangle_and_weave
    
    New support for main sections
    
    // Tangle: write out the first main section, expanding identifier
    // references into their definitions.
    foreach(block; mainSection[0].contents) {
        if(block.type == ESectionType.IDENTIFIER) {
            outputCodeContents ~= expand_code_identifier(fileSections, block.content, inputFilename);
        } else {
            outputCodeContents ~= block.content;
        }
    }
}

Main Function

// Program entry point.
//
// Expects exactly three arguments: the input web file, the HTML file to
// write the display output to, and the code file to write the tangled
// program to. Prints a usage message and exits when the argument count is
// wrong, and exits with a message when the input file is empty.
void main(string[] args)
{
    if(args.length != 4) {
        writefln("Usage: WEB3 inputFile outputHTMLFile outputCodeFile");
        // Stop here: args[1..3] below would be out of range otherwise.
        return;
    }
    const string inputFilename = args[1];
    string fileContents = cast(string) std.file.read(inputFilename);
    if(fileContents.length == 0) {
        writefln("Unable to read file '%s'.", inputFilename);
        return;
    }

    string outputDisplayContents = "";
    string outputCodeContents = "";
    parse_web_then_tangle_and_weave( outputDisplayContents, outputCodeContents, fileContents, inputFilename);

    string outputDisplayFilename = args[2];
    std.file.write(outputDisplayFilename, outputDisplayContents);

    string outputCodeFilename = args[3];
    std.file.write(outputCodeFilename, outputCodeContents);
}

Literate Programming 4

WEB 4 - Indexes and Tables of Contents

Existing code

Polishing the display

Code Scanner Bug

Generating a Table of Contents

Table of Contents:

Generating an Index

Index:

Explicit Main Sections

Better Section Management

Custom Index Terms

Styled Text