Prev:WEB 3Top:WEB 0Next:WEB 5
@*WEB 4 - Indexes and Tables of Content.
Jon Breuer - September 10, 2024.
@ I've been looking forward to a parser that can tangle code samples out of order. On the one hand it will let me talk about the interesting things first and on the other hand it could let me define a function, discuss it, then define it a second time and have the compiler pick up the updated definition.
@ Now that the parser can handle more tags, I'll be writing this file in WEB as much as possible with the |HTML| stuff kept to a minimum.
@ I really want to redefine existing sections and dynamically add content into group sections.
@ This is the source, and this is the output.
@*Existing code.
Skip Existing Code
@ The overall program layout looks like this:
@p
@
@
@
@
@
@
@>
@=
////////////
// WEB4.D
//
// This is a level 4 bootstrapping Literate Programming thing.
// It will insert indices and tables of contents. It may also allow appending or replacing sections.
//
module web3;
@>
@=
private import std.algorithm; // Needed for countUntil and searching
private import std.ascii; // Character type checks.
private import std.file; // Needed for file input and output
private import std.stdio; // Needed for error reporting and my debugging
private import std.string; // These programs are all about string processing.
@>
@ I've converted the bool isCode/isIdentifier into a consistent enum and started tracking line numbers within each text block.
@=
// Classifies both whole sections and the individual blocks inside a section.
// Member order determines the underlying values, so append new kinds at the end.
enum ESectionType {
// Program text; used for the main section and for named code sections.
CODE,
// A titled section heading.
HEADER,
// Ordinary prose.
PARAGRAPH,
// A block whose content is the name of another section to expand in place.
IDENTIFIER,
// A pipe-delimited term that is collected into the index.
INDEX_TERM,
// Styled text selected by the dot tag.
PRE, // Terms? Literal/Emphasis?
// Styled text selected by the caret tag.
BOLD,
};
// One section of the input document.
// Instances are built positionally as SSection(name, type, contents), so the
// field order below must not change.
struct SSection {
// Title or identifier; the main program section is stored as "__main__".
string name;
// Kind of section (code, header, paragraph, ...).
ESectionType type;
// The section body, split into typed blocks in document order.
SBlock[] contents;
};
// A run of text inside a section, tagged with its kind.
// Instances are built positionally as SBlock(type, lineNumber, content), so the
// field order below must not change.
struct SBlock {
// Kind of block; IDENTIFIER blocks name another section instead of holding text.
ESectionType type;
// Line counter value recorded for this block; emitted in #line directives when tangling.
int lineNumber;
// The text of the block, or the referenced section name for IDENTIFIER blocks.
string content;
};
@
@>
@=
@
@
@
@
@
@
@
@>
@ I'm inserting these hide blocks here. I wish I could insert bits of display inside a code block.
@=
// Finds the first occurrence of needle in haystack at or after startIndex.
//
// Returns the position of the match measured in code units from the start of
// haystack, so the result can be used directly as a slice bound, or -1 when
// needle does not occur (or startIndex lies outside haystack).
//
// std.string.indexOf is used instead of countUntil because countUntil counts
// decoded characters when given a string; that count stops matching the slice
// index as soon as a multi-byte UTF-8 character precedes the match.
ptrdiff_t countFromPosUntil(string haystack, ptrdiff_t startIndex, string needle)
{
    return indexOf(haystack, needle, cast(size_t) startIndex);
}
@>
@=
// Converts a code fragment into its display form.
//
// The text is first passed through escapeHTMLCharacters and then scanned left
// to right, recognising three kinds of token: line comments, quoted strings
// and character literals, and a fixed list of special identifiers. Everything
// else is copied through unchanged.
//
// source     - the raw code text.
// lineNumber - only used in the error message for an unterminated string.
//
// Returns the formatted text. If a quote is never closed, an error is printed
// and the remaining text is appended unformatted rather than being dropped.
//
// NOTE(review): the empty strings appended around each token are kept exactly
// as found; presumably they are where the styling markup belongs - confirm.
string formatCodeForDisplay(string source, int lineNumber)
{
    // Words that receive the special-identifier treatment.
    static immutable string[] identifiers = [ "const", "bool", "break", "char", "dchar", "else",
        "for", "if", "import", "int", "main", "module", "private", "return", "string",
        "std", "void", "while", ];

    string output = "";
    string scanner = escapeHTMLCharacters(source);
    scanLoop: while(!scanner.empty) {
        if(scanner.startsWith("//")) {
            // Color comments.
            output ~= "";
            // indexOf reports a code-unit offset, which is what slicing needs.
            ptrdiff_t lineLength = indexOf(scanner, '\n');
            if(lineLength < 0) {
                lineLength = scanner.length;
            }
            // Emit the whole comment. The newline itself is left in scanner and
            // copied by the fall-through branch on the next iteration.
            output ~= scanner[0..lineLength];
            output ~= "";
            scanner = scanner[lineLength..scanner.length];
        } else if(scanner.startsWith("\"") || scanner.startsWith("\'")) {
            // Color strings.
            char stringType = scanner[0];
            output ~= "";
            size_t stringLength = 1;
            while(stringLength < scanner.length && scanner[stringLength] != stringType) {
                if(scanner[stringLength] == '\\') {
                    // Step over the escaped character so an escaped quote does not end the string.
                    stringLength += 1;
                }
                stringLength += 1;
            }
            if(stringLength >= scanner.length) {
                writefln("ERROR: Unable to find close quote for string %s near line %d in string %s\n", scanner[0..min(scanner.length, 20)], lineNumber, source);
                // Keep the rest of the listing instead of silently truncating it.
                output ~= scanner;
                output ~= "";
                break scanLoop;
            }
            output ~= scanner[0..stringLength + 1];
            output ~= "";
            scanner = scanner[stringLength + 1..scanner.length];
        } else if(isAlpha(scanner[0])) {
            bool isNotIdentifier(dchar ch) { return !(isAlpha(ch) || isDigit(ch) || ch == '_'); }
            // Everything before the first non-identifier character is ASCII, so
            // the character count returned here equals the code-unit offset.
            ptrdiff_t wordLength = countUntil!isNotIdentifier(scanner);
            if(wordLength < 0) {
                wordLength = scanner.length;
            }
            if(identifiers.canFind(scanner[0..wordLength])) {
                // Special identifiers
                output ~= "";
                output ~= scanner[0..wordLength];
                output ~= "";
            } else {
                output ~= scanner[0..wordLength];
            }
            scanner = scanner[wordLength..scanner.length];
        } else {
            // Any other character is copied through one code unit at a time.
            output ~= scanner[0];
            scanner = scanner[1..scanner.length];
        }
    }
    return output;
}
@>
@=
// Replaces the characters that are significant in HTML with their entities so
// that source text can be embedded in the woven page without being read as
// markup.
//
// source - arbitrary text.
//
// Returns a copy of source in which every less-than, greater-than and
// ampersand character has been replaced by its entity; all other characters
// are copied unchanged.
string escapeHTMLCharacters(string source)
{
    string output;
    foreach(dchar ch; source) {
        switch(ch) {
            case '<':
                output ~= "&lt;";
                break;
            case '>':
                output ~= "&gt;";
                break;
            case '&':
                output ~= "&amp;";
                break;
            default:
                output ~= ch;
                break;
        }
    }
    return output;
}
@=
// Reads text from contents, starting at offset, until a tag that ends the
// current section is reached, and returns the text split into blocks.
//
// contents    - the whole input document.
// offset      - in: where to start reading; out: the index at which reading
//               stopped (the at-sign of the terminating tag, or the end of input).
// lineNumber  - running line counter, incremented for every newline consumed.
// recurse     - when true, section references are split out into their own
//               blocks; when false they end the read like any other tag.
// sectionType - type given to blocks that are flushed before a reference.
//
// Returns the blocks in document order. The text still pending when reading
// stops is always appended as a final block.
SBlock[] slurp_section(string contents, ref int offset, ref int lineNumber, bool recurse, ESectionType sectionType)
{
    SBlock[] results;
    string currentBlock = "";
    int startLineNumber = lineNumber;
    int index = offset;
    for(; index < contents.length; index++) {
        if(contents[index] == '@@') {
            if(index + 1 >= contents.length) {
                // An at-sign as the very last character has no tag letter to
                // inspect. Treat it like an unrecognised tag and stop, rather
                // than indexing past the end of the input.
                break;
            }
            if(recurse && contents[index + 1] == '<') {
                // Flush the text gathered so far before the reference.
                results ~= SBlock(sectionType, startLineNumber, currentBlock);
                currentBlock = "";
                startLineNumber = lineNumber;
@
                if(contents[index..$].startsWith("@@>")) {
                    // Step onto the last character of the close tag; the loop
                    // increment then moves to the first character after it.
                    // Advancing by two here would silently swallow that character.
                    index += 1;
                } else {
                    writefln("Identifier '%s' invoked without close tag. at %s", identifier, contents[index..min($, index + 10)]);
                    break;
                }
            } else if(contents[index + 1] == '@@') {
                currentBlock ~= contents[index];
                // Skip the escaped at symbol.
                index++;
@
            } else {
                // Any other tag ends this section; the caller handles it.
                break;
            }
        }
@
        else {
            if(contents[index] == '\n') {
                lineNumber++;
            }
            currentBlock ~= contents[index];
        }
    }
    // NOTE(review): the trailing block is always typed CODE, while the blocks
    // flushed above use sectionType - confirm whether that is intended.
    results ~= SBlock(ESectionType.CODE, startLineNumber, currentBlock);
    offset = index;
    return results;
}
@>
@=
// Produces the tangled text for one named section, expanding any section
// references it contains.
//
// sections      - every section parsed from the input.
// identifier    - name of the section to expand.
// inputFilename - file name written into the generated #line directives.
// expanding     - names currently being expanded further up the call chain;
//                 callers outside this function leave it at its default.
//
// Returns the expanded text. If no section matches, or if the section ends up
// including itself, an error is printed and a short error string is returned
// in place of the expansion.
string expand_code_identifier(SSection[] sections, string identifier, string inputFilename, string[] expanding = null)
{
    // A section that references itself, directly or through other sections,
    // would otherwise recurse until the stack is exhausted.
    if(expanding.canFind(identifier)) {
        writefln("ERROR: Identifier '%s' is defined in terms of itself.", identifier);
        return format("ERROR: %s is recursively defined", identifier);
    }

    string output;
    output ~= "/* from "~identifier~" */";
    SSection[] definitions = find_matching_identifiers(sections, identifier);
    if(definitions.empty) {
        writefln("ERROR: Unable to find identifier '%s'.", identifier);
        return format("ERROR: %s is undefined", identifier);
    }
    foreach(section; definitions) {
        foreach(block; section.contents) {
            // Point the compiler back at the original document for each block.
            output ~= format("\n#line %d \"%s\"\n", block.lineNumber, inputFilename);
            if(block.type == ESectionType.IDENTIFIER) {
                output ~= expand_code_identifier(sections, block.content, inputFilename, expanding ~ identifier);
            } else {
                output ~= block.content;
            }
        }
    }
    return output;
}
@>
@=
void parse_web_then_tangle_and_weave(ref string outputDisplayContents, ref string outputCodeContents, string fileContents, string inputFilename)
{
SSection[] fileSections;
int lineNumber = 0;
int charIndex = 0;
while(charIndex < fileContents.length) {
dchar ch = fileContents[charIndex];
if(ch == '@@') {
dchar chNext = charIndex < fileContents.length - 1 ? fileContents[charIndex + 1] : 0;
charIndex += 2;
if(chNext == '@@') {
// It's just an escaped at. Continue parsing.
} else if(chNext == 'p') {
fileSections ~= SSection("__main__", ESectionType.CODE, slurp_section(fileContents, charIndex, lineNumber, true, ESectionType.CODE));
} else if(chNext == '>') {
//End tag. This should be the end of this block.
} else if(chNext == '<') {
SBlock[] identifierBlocks = slurp_section(fileContents, charIndex, lineNumber, false, ESectionType.IDENTIFIER);
assert(identifierBlocks.length == 1);
string identifier = identifierBlocks[0].content;
SBlock[] sectionContents;
if(fileContents[charIndex..charIndex + 3] == "@@>=") {
charIndex += 3;
sectionContents = slurp_section(fileContents, charIndex, lineNumber, true, ESectionType.CODE);
} else {
writefln("Identifier '%s' invoked outside program and not a definition.", identifier);
}
fileSections ~= SSection(identifier, ESectionType.CODE, sectionContents);
} else if(chNext == '*') {
int titleEndingPeriod = countFromPosUntil(fileContents, charIndex, ".");
string title = "";
if(titleEndingPeriod > 0) {
title = fileContents[charIndex..titleEndingPeriod];
charIndex = titleEndingPeriod + 1;
}
fileSections ~= SSection(title, ESectionType.HEADER, slurp_section(fileContents, charIndex, lineNumber, false, ESectionType.HEADER));
} else {
// '@@ ' will be converted into a section.
fileSections ~= SSection("", ESectionType.PARAGRAPH, SBlock(ESectionType.PARAGRAPH, lineNumber, "
" ~ section.name ~"";
}
@>
@ This inserts a little javascript button to hide and show each code block.
@=
static if(false) {
outputDisplayContents ~= format(" ", section.name);
}
outputDisplayContents ~= "
";
@>
@ I want to highlight the existing blocks where I made a change.
@ I want to generate something different than |HTML| and |D|.
@*Code Scanner Bug. |Bug|! I've noticed a bug where code segments have to be separated by non-code sections. Here's the fix in the slurp section function.
@=
@
@
@
@ Save the scanner index before reading the identifier.
@=
// Remember where the tag begins so the scanner can be rewound to it later.
int preIdentifierIndex = index;
// Step over the two-character open tag.
index += 2;
// Read the name with recursion off; index is left where that read stopped.
SBlock[] identifierBlocks = slurp_section(contents, index, lineNumber, false, sectionType);
// The name is expected to come back as exactly one block.
assert(identifierBlocks.length == 1);
string identifier = identifierBlocks[0].content;
@ An identifier inserted in a block will look like @@> and the definition of a new identifier will look like @@>=. If the last code section is ending because of the start of a new one, revert the identifier and allow the new section to start reading.
@=
// A close tag followed by an equals sign marks a definition, not a reference.
if(contents[index..$].startsWith("@@>=")) {
// The end of one block has bumped into the start of another. Roll back.
// Restoring the saved index lets the caller see the tag from its first character.
index = preIdentifierIndex;
break;
}
@ Now that we're sure we're still in the old section, add the new identifier.
@=
// Now that we're sure this is a reference to an identifier and not a definition of a new identifier, continue.
results ~= SBlock(ESectionType.IDENTIFIER, lineNumber, identifier);
@*Generating a Table of Contents. We're going to start generating a table of contents. I think |WEB| uses ||special word|| to generate a separate index. I've inserted a special token __table_of_contents__ to control where the TOC gets generated. Header and code sections both have titles, so I can insert them in the TOC. Headers define major sections, so I indent sub-sections below them. Luckily HTML will convert an empty list <ul></ul> into no space at all, so I can start inside an empty header and then the first header will bump us out. (Saves me tracking the start of the first header.)
@=
if(section.name == "__table_of_contents__") {
outputDisplayContents ~= "
";
continue;
}
@*__table_of_contents__.
@*Generating an Index. An index is the same thing as a TOC except the list is alphabetic. (Version 1 has duplicates here from both the definition and references.)
@=
if(section.name == "__index__") {
@
@
@
@
outputDisplayContents ~= "";
continue;
}
@=
outputDisplayContents ~= "
Index:
";
outputDisplayContents ~= "
";
@ Like the Table of Contents, we're linking to Header and Code sections. I've added tagging for Index Terms so they get added to the index as well.
@=
// Gather every index entry as a "term, separator, owning section" string.
string[] references;
foreach(SSection indexSection; fileSections) {
    // Names with a double-underscore prefix are skipped entirely.
    if(indexSection.name.startsWith("__")) {
        continue;
    }
    // Header and code sections are listed under their own name.
    switch(indexSection.type) {
        case ESectionType.HEADER:
        case ESectionType.CODE:
            references ~= indexSection.name ~ "@@" ~ indexSection.name;
            break;
        default:
            break;
    }
    // Section references and tagged index terms are listed under the section that contains them.
    foreach(SBlock block; indexSection.contents) {
        if(block.type == ESectionType.IDENTIFIER) {
            references ~= block.content ~ "@@" ~ indexSection.name;
        } else if(block.type == ESectionType.INDEX_TERM) {
            references ~= block.content.strip("|") ~ "@@" ~ indexSection.name ~ block.content;
        }
    }
}
@=
import std.algorithm.mutation : SwapStrategy;
// Case-insensitive ordering; the stable strategy keeps entries that compare equal in their collected order.
auto sortedReferences = references.sort!((a, b) => a.toUpper < b.toUpper, SwapStrategy.stable);
@=
string lastReference = "";
int referenceCount = 0;
foreach(string reference; sortedReferences) {
@
@
}
@ I cached the references as "block @@ section" for ease in storage and sorting. Parse them out. Then check if we've hit a new term to start a new index entry versus adding numbered subscripts.
@=
string[] components = reference.split("@@");
if(components[0] != lastReference) {
lastReference = components[0];
outputDisplayContents ~= "
" ~ components[0] ~ ":" ~ "";
referenceCount = 0;
}
outputDisplayContents ~= " " ;
@=
referenceCount++;
if(components[0] == components[1]) {
outputDisplayContents ~= "(definition) "~ "";
} else {
string indexNumber = format( "%d", referenceCount);
outputDisplayContents ~= indexNumber ~ "";
}
@*__index__.
@*Explicit Main Sections. The @@p tag is renamed __main__. Make sure there's exactly one.
@=
// Look up every section stored under the reserved main-program name.
SSection[] mainSection = find_matching_identifiers(fileSections, "__main__");
// Zero or several main sections are both errors; report what was found and stop.
if(mainSection.length != 1) {
writefln("ERROR: Exactly 1 __main__ section needed. %d found.", mainSection.length);
foreach(section; mainSection) {
// The first block's line number is used to locate each candidate.
writefln("%s found at line %d", section.name, section.contents[0].lineNumber);
}
return;
}
// The single main section must hold code.
if(mainSection[0].type != ESectionType.CODE) {
writefln("ERROR: __main__ section needs to be code.");
return;
}
@*Better Section Management. I've wanted to append sections like includes. I don't want to call out each include, I want to add includes through the demonstration and have them accumulate at the top. For tutorial purposes, I'd like to define a section, then expand and replace it. Knuth's original WEB supports truncated identifiers. The reference may be a full sentence and the definition is truncated.
@=
// Collects the sections whose name matches identifier.
//
// A section name may be written out in full, or truncated with a trailing
// "..." in which case it matches any identifier that begins with the part
// before the dots. A truncated name may carry one more character after the
// dots: "+" adds the section to the matches found so far, "!" discards the
// matches found so far and starts again from this section.
//
// Returns the matching sections in document order. A warning is printed when
// a name without either suffix matches after something else already has.
SSection[] find_matching_identifiers(SSection[] sections, string identifier) {
    SSection[] matches;
    foreach(candidate; sections) {
        immutable bool appends = candidate.name.endsWith("...+");
        immutable bool replaces = candidate.name.endsWith("...!");

        // Decide whether this candidate's name covers the identifier.
        bool covers;
        if(candidate.name.endsWith("...")) {
            covers = identifier.startsWith(candidate.name[0..$-3]);
        } else if(replaces || appends) {
            covers = identifier.startsWith(candidate.name[0..$-4]);
        } else {
            covers = candidate.name == identifier;
        }
        if(!covers) {
            continue;
        }

        // Apply the suffix semantics before recording the match.
        if(replaces) {
            matches = [];
        } else if(!appends && !matches.empty) {
            writefln("WARNING: Multiple matches for '%s' found. Use ...+ to append or ...! to replace.", identifier);
        }
        matches ~= candidate;
    }
    return matches;
}
@=
// There should be 2 appended comments, 1 replaced comment, and 1 partial comment. This message will repeat.
@
@
// There should be 2 appended comments, 1 replaced comment, and 1 partial comment. This message is the repetition.
@=
// This is the original named appending
@=
// This is the first appended comment
@=
// This is the second appended comment
@=
// This is the wrong replaced comment
@=
// This is an unseen replaced comment
@=
// This is the correct replaced comment
@
@=
// Partial names appended
@ The result is something like this:
Results:
// There should be 2 appended comments, 1 replaced comment, and 1 partial comment. This message will repeat.
// This is the original named appending
// This is the first appended comment
// This is the second appended comment
// This is the correct replaced comment
// Partial names appended
// There should be 2 appended comments, 1 replaced comment, and 1 partial comment. This message is the repetition.
@* Custom Index Terms.
@ I'm tired of version 1 of each parser choking on a tag and me having to escape it either temporarily or permanently. Define tokens here.
@=
// Named forms of the characters the scanner treats specially, so that they
// need not be written out as literals wherever they are tested.
// Delimits an index term.
const dchar CHAR_PIPE = '|';
// Introduces a tag.
const dchar CHAR_AT = '@@';
// Counted to keep line numbers up to date.
const dchar CHAR_NEWLINE = '\n';
@=
// Pipe handling outside of code sections: a doubled pipe is an escaped literal
// pipe, and a single pipe opens an index term that runs to the next single pipe.
else if(contents[index] == CHAR_PIPE && sectionType != ESectionType.CODE) {
    // Check the bound before looking ahead so a pipe that is the last
    // character of the input is reported below instead of indexing past the end.
    if(index + 1 < contents.length && contents[index + 1] == CHAR_PIPE) {
        currentBlock ~= contents[index];
        // Skip the escaped pipe symbol.
        index++;
    } else {
        // Save the previous block.
        results ~= SBlock(sectionType, startLineNumber, currentBlock);
        currentBlock = "";
        startLineNumber = lineNumber;
        // The term is stored with its delimiting pipes included.
        currentBlock ~= contents[index];
        for(index++; index < contents.length; index++) {
            currentBlock ~= contents[index];
            if(contents[index] == CHAR_NEWLINE) {
                lineNumber++;
            }
            if(contents[index] == CHAR_PIPE) {
                if(index < contents.length - 1 && contents[index + 1] == CHAR_PIPE) {
                    // Skip the escaped pipe symbol.
                    index++;
                } else {
                    // Closing pipe found.
                    break;
                }
            } else if(contents[index] == CHAR_AT) {
                if(index < contents.length - 1 && contents[index + 1] == CHAR_AT) {
                    // Skip the escaped at symbol.
                    index++;
                } else {
                    writefln("WARNING: At symbol encountered before end of piped index term.");
                    break;
                }
            }
        }
        // The inner loop can stop for three reasons; only a closing pipe is a success.
        if(index >= contents.length || contents[index] != CHAR_PIPE) {
            writefln("WARNING: Close pipe expected near line %d.", startLineNumber);
            break;
        }
        // Save the indexed identifier.
        results ~= SBlock(ESectionType.INDEX_TERM, startLineNumber, currentBlock);
        currentBlock = "";
        startLineNumber = lineNumber;
    }
}
@*Styled Text.
@ '@@ ' and '@@^' start certain font styles and '@@>' will end them. This code is inserted into slurp_section.
@ I am not sure how @^this will@> work with @.WEB3@>. A thing to test.
@ This is the next unstyled paragraph.
@=
} else if(contents[index + 1] == '^' || contents[index + 1] == '.') {
ESectionType styleType = contents[index + 1] == '^' ? ESectionType.BOLD : ESectionType.PRE;
@
@
@
@
@=
SBlock[] textBlocks = slurp_section(contents, index, lineNumber, false, sectionType);
assert(textBlocks.length == 1);
string textBlock = textBlocks[0].content;
@=
// The styled run must be terminated by the close tag.
if(contents[index..$].startsWith("@@>")) {
// Advance by one only: this moves onto the second character of the tag.
index += 1;
} else {
writefln("Identifier '%s' invoked without close tag. at %s", textBlock, contents[index..min($, index + 10)]);
break;
}
@ An identifier inserted in a block will look like @@> and the definition of a new identifier will look like @@>=. If the last code section is ending because of the start of a new one, revert the identifier and allow the new section to start reading.
@ Now that we're sure we're still in the old section, add the new identifier.
@=
results ~= SBlock(styleType, lineNumber, textBlock);
@=
results ~= SBlock(sectionType, startLineNumber, currentBlock);
currentBlock = "";
startLineNumber = lineNumber;
index += 2;
@
TODO:
Having truncated names, I should fill out the names with the full text.
I want to highlight the existing blocks where I made a change.
I want to generate something different than HTML and D.
'@@^' is a "Roman text" index entry. - Done.
'@@.' is a code or < pre > index entry. - Done.
Fix line numbers - Done.
Truncated identifier names. Writing a sentence to describe the function is good. Repeating it is tiring. - Done.
Append could be accomplished by FindAll here. - Done.
Replace section - Done.
I think WEB uses ||special word|| => |special word| to generate a separate index. - Done.