Prev:WEB 2 Top:WEB 0 Next:WEB 4
Jon Breuer - September 8, 2024.
Now I'd like to write parts of this program out of order. Or rather, I want them in conceptual order instead of compiling order. This will require naming sections and then slurping multiple named sections into a group.
Also, now that I can clean up the code for the display html, the source is here (literate_programming_3_source.html) and this is the generated file.
////////////
// WEB3.D
//
// This is a level 3 bootstrapping Literate Programming thing.
// It will start tangling code and weaving HTML from a variety of source blocks.
//
module web3;

private import std.algorithm; // Needed for countUntil and searching
private import std.ascii;     // Character type checks.
private import std.file;      // Needed for file input and output
private import std.stdio;     // Needed for error reporting and my debugging
private import std.string;    // These programs are all about string processing.

/**
 * Find `needle` in `haystack`, starting the search at `startIndex`.
 *
 * Returns: the index of the match measured from the start of `haystack`
 * (in code units, so it can be used directly for slicing), or a negative
 * value when there is no match or `startIndex` lies outside the string.
 */
ptrdiff_t countFromPosUntil(string haystack, ptrdiff_t startIndex, string needle)
{
    if(startIndex < 0 || startIndex > haystack.length)
    {
        return -1;
    }
    // indexOf reports a code-unit offset. countUntil on a string counts code
    // points, which is not a valid slice offset once non-ASCII text is involved.
    ptrdiff_t offset = indexOf(haystack[startIndex..$], needle);
    if(offset < 0)
    {
        return offset;
    }
    return startIndex + offset;
}

/**
 * Turn a chunk of source code into HTML for the woven page.
 *
 * The text is HTML-escaped first, then line comments, string/character
 * literals and a fixed list of words are wrapped in <span> elements with
 * the classes code_comment, code_string and code_identifier.
 *
 * Params:
 *   source     = raw code text
 *   lineNumber = only used in the diagnostic printed for an unterminated string
 *
 * Returns: the HTML fragment. When a string literal is never closed, a
 * diagnostic is printed and the remaining text is emitted without markup.
 */
string formatCodeForDisplay(string source, int lineNumber)
{
    // Words that receive the code_identifier styling. Built once, not per token.
    static immutable string[] highlightedWords = [
        "const", "bool", "break", "char", "dchar", "else", "for", "if",
        "import", "int", "main", "module", "private", "return", "string",
        "std", "void", "while",
    ];

    static bool isIdentifierChar(char ch)
    {
        return isAlpha(ch) || isDigit(ch) || ch == '_';
    }

    string output = "";
    string scanner = escapeHTMLCharacters(source);
    while(scanner.length > 0)
    {
        if(scanner.startsWith("//"))
        {
            // Color comments. The comment runs up to, but not including, the
            // newline; a comment at the very end of the text has no newline.
            ptrdiff_t lineLength = indexOf(scanner, '\n');
            if(lineLength < 0)
            {
                lineLength = scanner.length;
            }
            output ~= "<span class=\"code_comment\">";
            output ~= scanner[0..lineLength];
            output ~= "</span>";
            scanner = scanner[lineLength..$];
        }
        else if(scanner[0] == '"' || scanner[0] == '\'')
        {
            // Color strings. Scan for the matching quote, stepping over
            // backslash escapes so that \" does not end the literal.
            const char stringType = scanner[0];
            size_t stringLength = 1;
            while(stringLength < scanner.length && scanner[stringLength] != stringType)
            {
                if(scanner[stringLength] == '\\')
                {
                    stringLength += 1;
                }
                stringLength += 1;
            }
            if(stringLength >= scanner.length)
            {
                writefln("Unable to find close quote for string %s near line %d in string %s\n",
                    scanner[0..min(scanner.length, 20)], lineNumber, source);
                // Keep the rest of the text instead of silently dropping it.
                output ~= scanner;
                break;
            }
            output ~= "<span class=\"code_string\">";
            output ~= scanner[0..stringLength + 1];
            output ~= "</span>";
            scanner = scanner[stringLength + 1..$];
        }
        else if(isAlpha(scanner[0]) || scanner[0] == '_')
        {
            // Consume the whole word so that a keyword embedded in a longer
            // identifier (including one starting with '_') is not highlighted.
            size_t wordLength = 1;
            while(wordLength < scanner.length && isIdentifierChar(scanner[wordLength]))
            {
                wordLength += 1;
            }
            const string word = scanner[0..wordLength];
            if(canFind(highlightedWords[], word))
            {
                // Special identifiers
                output ~= "<span class=\"code_identifier\">";
                output ~= word;
                output ~= "</span>";
            }
            else
            {
                output ~= word;
            }
            scanner = scanner[wordLength..$];
        }
        else
        {
            output ~= scanner[0];
            scanner = scanner[1..$];
        }
    }
    return output;
}

/**
 * Replace the characters that are special in HTML text (<, > and &) with
 * their entity references; every other character is copied unchanged.
 */
string escapeHTMLCharacters(string source)
{
    string output;
    foreach(dchar ch; source)
    {
        switch(ch)
        {
            case '<': output ~= "&lt;";  break;
            case '>': output ~= "&gt;";  break;
            case '&': output ~= "&amp;"; break;
            default:  output ~= ch;      break;
        }
    }
    return output;
}

// Entry point: WEB3 inputFile outputHTMLFile outputCodeFile.
// The body of main continues in the following code chunks.
void main(string[] args)
{
    if(args.length != 4)
    {
        writefln("Usage: WEB3 inputFile outputHTMLFile outputCodeFile");
        // Without the arguments there is nothing to read or write.
        return;
    }
    const string inputFilename = args[1];
    string fileContents = cast(string) std.file.read(inputFilename);
    if(fileContents.length == 0)
    {
        writefln("Unable to read file '%s'.", inputFilename);
        return;
    }
I've cut out the file processor and moved it into a new function we'll write after the discussion.
string outputDisplayContents = ""; string outputCodeContents = ""; parse_web_then_tangle_and_weave( outputDisplayContents, outputCodeContents, fileContents, inputFilename);
Existing code:
string outputDisplayFilename = args[2]; std.file.write(outputDisplayFilename, outputDisplayContents); string outputCodeFilename = args[3]; std.file.write(outputCodeFilename, outputCodeContents); }
There were also two places I found where a comment without a trailing newline, taking up the entire remaining string, would cause an infinite parse loop.
int lineLength = countUntil(scanner, "\n"); if(lineLength < 0) { lineLength = scanner.length; }
I've been re-reading (http://www.literateprogramming.com/lpsimp.pdf) and that's a modified LP system called NoWeb. I reread (http://www.literateprogramming.com/knuthweb.pdf) and noticed differences between what I'm attempting and what Knuth accomplished.
I have been treating *.WEB files as TeX files with added WEB markup, as if Knuth were writing primarily in TeX with the code examples being pulled out. That's what my files are - HTML with code insertions. This isn't quite accurate. Every section of a *.WEB file is a WEB command, and the display sections support embedded display language constructs.
Codes I don't support.
WEB also supports a fair amount of recursive embedding, which I don't support yet.
My work thus far has been taking WEB0 and using it to compile both literate_programming_0.html and literate_programming_1.html, then continuing that process with WEB1. I think I need to create a side example file that has the features I want WEB3 to support.
Snippet from examples.
@p // Start my program. @c @<Header files to include@>@/ @<Global variables@>@/ @<Functions@>@/ @<The main program@> @ Any other section to break the first definitions.
I don't know what @/ means.
@<The main...@>= main (argc,argv) int argc; /* the number of arguments on the \UNIX/ command line */ char **argv; /* the arguments themselves, an array of strings */ { @<Variables local to |main|@>@; prog_name=argv[0]; @<Set up option selection@>; writefln("asdf"); @<Process all the files@>; writefln("sdfg"); @<Print the grand totals if there were multiple files @>; exit(status); } Inferences: @; means insert the semicolon into C/Pascal, but don't render it in the display file.
I think I can treat @<blah@> as a reference to an identifier and @<blah@>=content as the definition. My tool thus far has been a glorified search-and-replace, but nested, out-of-order identifiers will require more careful work.
My test snippet: (I'll copy and paste this out.) literate_programming_3_test.web Skip to the results
And now, let's rebuild the parser. We're going to need to track the different sections that we're assembling. * A section is either a display section or a code section. A display section should have 1 block of text. A code section might have many.
// One run of text inside a section.
struct SBlock
{
    // True when `content` is the name of another section being referenced,
    // false when `content` is literal text.
    bool isIdentifier;
    string content;
}

// One section of the WEB file: a name (possibly empty), whether it holds
// code or display text, and the blocks that make up its body.
struct SSection
{
    string name;
    bool isCode;
    SBlock[] contents;
}
This is the parsing function.
void parse_web_then_tangle_and_weave(ref string outputDisplayContents, ref string outputCodeContents, string fileContents, string inputFilename) { SSection[] fileSections;
We will scan through every character; the @ control codes determine which kind of section a block of text becomes.
int charIndex = 0; while(charIndex < fileContents.length) { dchar ch = fileContents[charIndex]; if(ch == '@') { dchar chNext = charIndex < fileContents.length - 1 ? fileContents[charIndex + 1] : 0;
Every @ character is a pair and we'll skip both of them every time. No extra work is needed for an escaped @@ sequence.
charIndex += 2; if(chNext == '@') {
Program blocks get turned into code.
} else if(chNext == 'p') { fileSections ~= SSection("__main__", true, slurp_section(fileContents, charIndex, true));
Close blocks don't need extra processing.
} else if(chNext == '>') {
Definition blocks.
} else if(chNext == '<') { SBlock[] identifierBlocks = slurp_section(fileContents, charIndex, false); assert(identifierBlocks.length == 1); string identifier = identifierBlocks[0].content; SBlock[] sectionContents; if(fileContents[charIndex..charIndex + 3] == "@>=") { charIndex += 3; sectionContents = slurp_section(fileContents, charIndex, true); } else { writefln("Identifier '%s' invoked outside program and not a definition.", identifier); } fileSections ~= SSection(identifier, true, sectionContents);
Display blocks with titles.
} else if(chNext == '*') { writefln("title section"); int titleEndingPeriod = countFromPosUntil(fileContents, charIndex, "."); string title = ""; if(titleEndingPeriod > 0) { title = fileContents[charIndex..titleEndingPeriod]; charIndex = titleEndingPeriod + 1; } fileSections ~= SSection(title, false, slurp_section(fileContents, charIndex, false));
Unknown blocks get turned into display.
} else { writefln("paragraph section"); // '@ ' will be converted into a section. fileSections ~= SSection("", false, slurp_section(fileContents, charIndex, false)); }
Raw text without a starting @ isn't legal WEB, but I default it to display text.
} else { writefln("raw display section"); fileSections ~= SSection("", false, slurp_section(fileContents, charIndex, false)); } } int lineNumber = 0; //TODO:// Move this up for debugging.
After parsing out the sections, format them for output.
Because this file is HTML with the occasional bit of WEB commands and thus far I've been using @p as the "any code" marker, @p gets a header inserted after every snippet. Add a hack to remove duplicates.
bool isSectionNamedMain(SSection section) { return section.name == "__main__"; } int numMainSections = count!isSectionNamedMain(fileSections); bool hideMainHeaders = numMainSections > 1; writefln("There are %d main sections.", numMainSections);
Start formatting.
foreach(SSection section; fileSections) {
Display sections get stuck into the display output.
if(!section.name.empty) { // A hack for this version. if(hideMainHeaders && section.name == "__main__") { } else { outputDisplayContents ~= "<h2>" ~ section.name ~"</h2>" ~ "<p>"; } } if(section.isCode == false) { string content = reduce!("a ~ b.content")("", section.contents); outputDisplayContents ~= content; lineNumber += count(content, '\n'); } else {
Code sections get formatted into the display and stuck into the code.
outputDisplayContents ~= "<pre>"; foreach(block; section.contents) { string outputContent = formatCodeForDisplay(block.content, lineNumber); if(block.isIdentifier) { outputDisplayContents ~= "<b><i>" ~ outputContent ~ "</i></b>"; } else { outputDisplayContents ~= outputContent; } } outputDisplayContents ~= "</pre>"; } } foreach(section; fileSections) { if(section.isCode && section.name == "__main__") { foreach(block; section.contents) { if(block.isIdentifier) { outputCodeContents ~= expand_code_identifier(fileSections, block.content); } else { outputCodeContents ~= block.content; } } } } }
A section is terminated by the start of a new section, so scan for @. This function will get more complicated soon.
/**
 * Collect the text of one section, starting at `offset` and stopping at the
 * next control code that starts a new section.
 *
 * Params:
 *   contents = the whole WEB file
 *   offset   = where to start reading; on return, the index of the character
 *              that stopped the scan (or the end of `contents`)
 *   recurse  = when true, "@<name@>" inside the text is recorded as a
 *              reference block instead of ending the section
 *
 * Returns: the blocks that were read. The last block is always a text block
 * (possibly empty), so a call with `recurse == false` yields exactly one block.
 *
 * "@@" is an escaped at-sign and is stored as a single '@'. Any other control
 * code ends the section, as does a lone '@' at the very end of the file.
 */
SBlock[] slurp_section(string contents, ref int offset, bool recurse)
{
    SBlock[] results;
    string currentBlock = "";
    int index = offset;
    while(index < contents.length)
    {
        if(contents[index] != '@')
        {
            currentBlock ~= contents[index];
            index++;
            continue;
        }

        // Peek at the control character without reading past the end.
        const char next = index + 1 < contents.length ? contents[index + 1] : '\0';

        if(recurse && next == '<')
        {
            results ~= SBlock(false, currentBlock);
            currentBlock = "";
            const int preIdentifierIndex = index;
            index += 2;
            SBlock[] identifierBlocks = slurp_section(contents, index, false);
            assert(identifierBlocks.length == 1);
            string identifier = identifierBlocks[0].content;
            if(contents[index..$].startsWith("@>="))
            {
                // The end of one block has bumped into the start of another. Roll back.
                index = preIdentifierIndex;
                break;
            }
            // Now that we're sure this is a reference to an identifier and not
            // a definition of a new identifier, continue.
            results ~= SBlock(true, identifier);
            if(!contents[index..$].startsWith("@>"))
            {
                writefln("Identifier '%s' invoked without close tag. at %s", identifier, contents[index..min($, index + 10)]);
                break;
            }
            // Step over the close tag only; the character that follows it
            // (for example the ';' in "@<x@>;") belongs to the section text.
            index += 2;
        }
        else if(next == '@')
        {
            // Escaped at symbol: keep one '@' and skip the pair.
            currentBlock ~= '@';
            index += 2;
        }
        else
        {
            break;
        }
    }
    results ~= SBlock(false, currentBlock);
    offset = index;
    return results;
}
Recursively expand an identifier into its definitions.
/**
 * Recursively expand a named code section into the text of its definition.
 *
 * Params:
 *   sections   = every section parsed from the WEB file
 *   identifier = name of the section to expand
 *
 * Returns: a "from" comment followed by the section's text with every nested
 * reference expanded. The first section whose name matches is used. When no
 * section matches, or when a section ends up referring to itself, a
 * diagnostic is printed and a "** ... **" marker is returned in its place.
 */
string expand_code_identifier(SSection[] sections, string identifier)
{
    return expand_code_identifier_guarded(sections, identifier, null);
}

// Does the work for expand_code_identifier; `inProgress` lists the names
// currently being expanded so that a reference cycle can be detected.
private string expand_code_identifier_guarded(SSection[] sections, string identifier, string[] inProgress)
{
    if(canFind(inProgress, identifier))
    {
        // Expanding again would never terminate.
        writefln("Identifier '%s' is defined in terms of itself.", identifier);
        return format("** %s is recursive **", identifier);
    }

    bool identifiersMatch(SSection section)
    {
        return section.name == identifier;
    }
    SSection[] definition = find!identifiersMatch(sections);
    if(definition.length == 0)
    {
        writefln("Unable to find identifier '%s'.", identifier);
        return format("** %s is undefined **", identifier);
    }

    // A fresh array per level, so sibling references do not see each other.
    string[] nested = inProgress ~ identifier;

    string output = "/* from " ~ identifier ~ " */";
    foreach(block; definition[0].contents)
    {
        // TODO:// Fix This: outputCodeContents ~= format("#line %d \"%s\"", lineNumber, inputFilename);
        if(block.isIdentifier)
        {
            output ~= expand_code_identifier_guarded(sections, block.content, nested);
        }
        else
        {
            output ~= block.content;
        }
    }
    return output;
}
Results: Back to the test code
The webpage documentation:
The D source code: