SGML parser library
Copyright (C) 1997 Michel Dagenais
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
For more information on this program, contact Michel Dagenais at
dagenais@vlsi.polymtl.ca or Electrical Eng. Dept., Ecole Polytechnique
P.O. Box 6079, Station A, Montreal, Quebec, Canada, H3C 3A7.
MODULE SGML EXPORTS SGML, SGMLRep;
IMPORT SGML, SGMLC, SGMLCScanner, Text, RefSeq, TextRefTbl, TextTextTbl, Atom,
ASCII, FileRd, TextRd, Pathname, FS, Process, OSError,
IO, Rd, FSM, Fmt, M3Config;
(* Concrete representation of the parser object: the fields inherited from
   ParserPublicRep extended with the private state used by this module. *)
REVEAL
SGML.Parser = ParserPublicRep BRANDED OBJECT
(* Options received by init; InitParser replaces the optional arrays
   with private copies. *)
options: SGML.ParserOptions;
(* Warning categories enabled by InitWarnings from options.enableWarning. *)
warn: Warnings;
programName: TEXT;
(* Names of the documents to parse and, optionally, readers supplied for
   them. When rds is NIL, RunParser opens each file by name. *)
files: REF ARRAY OF TEXT;
rds: REF ARRAY OF Rd.T;
(* Error handler handed to the scanner and to the parser. *)
e: Err;
(* Index in files of the document being parsed by RunParser. *)
fileNo: CARDINAL;
(* Last value received by inhibitMessages. *)
inhibit: BOOLEAN := TRUE;
(* Definitions in effect for the document being parsed. *)
defn: Definitions;
(* Catalog and Dtd definitions; shared with parsers made by newParser. *)
cache: Cache;
(* Frames of the enclosing elements, innermost last, and the frame of the
   element currently being filled. *)
stateStack: RefSeq.T;
currState: StateFrame;
OVERRIDES
init := InitParser;
run := RunParser;
halt := HaltParser;
inhibitMessages := InhibitMessages;
subdocumentParser := SubdocumentParser;
newParser := NewParser;
END;
(* Concrete representation of the application object. *)
SGML.Application = SGML.ApplicationPublic BRANDED OBJECT
(* The parser currently running on behalf of this application; assigned
   by RunParser. *)
parser: SGML.Parser;
OVERRIDES
getDetailedLocation := GetDetailedLocation;
init := ApplicationInit;
END;
The parser maintains tables of defined entities (parameter, general
and doctype), of public names for some of these entities,
and of elements. Each element type has a finite state machine describing
the content it accepts. Since elements may nest, the current state for
the finite state machines of all currently nested elements must be kept
in a stack of state frames.
(* One frame per open element. *)
StateFrame = BRANDED REF RECORD
(* Name of the element this frame belongs to. *)
name: Atom.T;
(* Copied from the element description: TRUE when the end tag may be
   omitted. *)
omitE: BOOLEAN;
(* Current position in the content model of the element. *)
currNode: FSM.Node;
(* Content model of the element. *)
fsm: FSM.T;
END;
TYPE
(* Everything declared so far. Each table is keyed by name, except publics
   which maps a public identifier to a system identifier. *)
Definitions = RECORD
(* Element name to ElementDesc (see AddElement). *)
elements: TextRefTbl.Default;
(* Element name to the table of its attributes (see AddAttributes). *)
attlists: TextRefTbl.Default;
(* Notation name to REF SGML.Notation (see AddNotation). *)
notations: TextRefTbl.Default;
(* Parameter, general and doctype entities (see AddEntity). *)
pEntities: TextRefTbl.Default;
gEntities: TextRefTbl.Default;
dEntities: TextRefTbl.Default;
publics: TextTextTbl.Default;
END;
(* Definitions which are expensive to compute and therefore kept. *)
Cache = REF RECORD
(* Definitions obtained from the catalogs and the builtin entities. *)
catalog: Definitions;
(* Generated system identifier of a Dtd to the REF Definitions obtained
   after parsing it. NIL until RunParser has loaded the catalogs. *)
dtd: TextRefTbl.Default;
END;
(* Error handler given to the scanner and parser. *)
Err = SGMLC.ErrHandler BRANDED OBJECT
(* Number of errors received by GenError since RunParser started. *)
num: CARDINAL := 0;
(* Application to which the errors are forwarded. *)
application: SGML.Application;
OVERRIDES
error := GenError
END;
(* Entity resolver given to the scanner; it looks up parameter entities
   in the definitions of parser. *)
EntityResolver = SGMLCScanner.EntityResolver OBJECT
parser: SGML.Parser;
OVERRIDES
resolve := ResolveParameterEntity;
END;
The error handler and entity resolver allow the parser and scanner to
report errors and query for entities.
(* One flag per category of optional messages. All flags are off by
   default; InitWarnings turns them on according to the names found in
   options.enableWarning. *)
Warnings = RECORD
mixed, sgmldecl, should, default, duplicate, undefined, unclosed,
empty, net, unusedParam, notationSysid: BOOLEAN := FALSE;
END;
VAR
(* Name of the artificial outermost frame created by StartContent. *)
StartAtom := Atom.FromText("!INIT");
Report the error to the application.
(* Error method of the handler "e": count the error and forward the
   message to the application, located at the current parser offset. The
   line and column supplied by the caller are not used. *)
PROCEDURE GenError(e : Err ; <*UNUSED*>line : CARDINAL ;
                   <*UNUSED*>col : CARDINAL ; msg : TEXT) =
  BEGIN
    e.num := e.num + 1;
    WITH app = e.application DO
      app.error(SGML.ErrorEvent{app.parser.p.offset(),
                                SGML.ErrorType.OtherError, msg});
    END;
  END GenError ;
Convert a numeric character reference (decimal or hexadecimal digits between "&#" and ";") to the code of the associated character.
(* Convert the character reference "t" to the code of the character it
   denotes.

   The accepted forms are the named reference "&amp;", a decimal reference
   "&#ddd;" and a hexadecimal reference "&#xhh;" (or "&#Xhh;"). On success
   the code is stored in "c" and TRUE is returned. FALSE is returned, and
   "c" is left untouched, when "t" is not a well formed reference, when it
   contains no digit at all, or when its value exceeds
   LAST(SGML.CharCode). *)
PROCEDURE CharRefToCode(t: TEXT; VAR c: SGML.CharCode): BOOLEAN =
  VAR
    car: CHAR;
    val: CARDINAL := 0;
    digit: CARDINAL;
    base: CARDINAL := 10;
    first: CARDINAL := 2;
    length := Text.Length(t);
  BEGIN
    IF length < 3 THEN RETURN FALSE; END;

    (* The only named reference handled here. The literal must be the full
       reference: a lone ampersand could never get past the length test
       above. *)
    IF Text.Equal(t,"&amp;") THEN
      c := ORD('&');
      RETURN TRUE;
    END;

    IF Text.GetChar(t,0) # '&' OR Text.GetChar(t,1) # '#' OR
       Text.GetChar(t,length - 1) # ';' THEN
      RETURN FALSE;
    END;

    IF Text.GetChar(t,2) = 'x' OR Text.GetChar(t,2) = 'X' THEN
      base := 16;
      first := 3;
    END;

    (* At least one digit is required between the prefix and the ';'. *)
    IF first > length - 2 THEN RETURN FALSE; END;

    FOR i := first TO length - 2 DO
      car := Text.GetChar(t,i);
      IF car IN ASCII.Digits THEN
        digit := ORD(car) - ORD('0');
      ELSIF base = 16 THEN
        car := ASCII.Upper[car];
        IF car < 'A' OR car > 'F' THEN RETURN FALSE; END;
        digit := 10 + (ORD(car) - ORD('A'));
      ELSE
        (* Anything but 0-9 is invalid in a decimal reference. *)
        RETURN FALSE;
      END;
      val := (base * val) + digit;
      (* Checked after every digit, which also keeps val small. The largest
         code, LAST(SGML.CharCode), is itself valid. *)
      IF val > LAST(SGML.CharCode) THEN RETURN FALSE; END;
    END;

    c := val;
    RETURN TRUE;
  END CharRefToCode;
Allocate all the parser tables. Provide default values.
(* Init method of the parser. Stores the options (after replacing every
   optional array by a private copy completed with its default entry),
   derives the enabled warnings, copies the list of files and allocates the
   buffers, scanner, parser, error handler and an empty cache. Returns
   self. *)
PROCEDURE InitParser(self: SGML.Parser;
    options: SGML.ParserOptions; <*NOWARN*>
    programName: TEXT; files: REF ARRAY OF TEXT;
    rds: REF ARRAY OF Rd.T := NIL): SGML.Parser =
  CONST
    InitialBufferSize = 15;
  VAR
    dtdDir := M3Config.PKG_USE &
        M3Config.PATH_SEP & "sgml" & M3Config.PATH_SEP & "src" &
        M3Config.PATH_SEP & "dtd";
  BEGIN
    self.options := options;
    WITH o = self.options DO
      CopyTextArray(o.addCatalog,"catalog");
      CopyTextArray(o.includeParam);
      CopyTextArray(o.enableWarning);
      (* The warnings are derived from the copy made just above. *)
      InitWarnings(self);
      CopyTextArray(o.addSearchDir,dtdDir);
      CopyTextArray(o.activateLink);
      CopyTextArray(o.architecture);
    END;

    self.programName := programName;
    self.rds := rds;
    self.files := NEW(REF ARRAY OF TEXT,NUMBER(files^));
    self.files^ := files^;

    self.attributes := NEW(REF ARRAY OF SGML.Attribute, InitialBufferSize);
    self.dataChunks := NEW(REF ARRAY OF SGML.CdataChunk, InitialBufferSize);
    self.stateStack := NEW(RefSeq.T);

    self.e := NEW(Err);
    self.s := NEW(SGMLC.Scanner);
    self.p := NEW(ParserPlus, parser := self);
    (* A NIL dtd table tells RunParser that the catalogs are not loaded. *)
    self.cache := NEW(Cache, dtd := NIL);
    RETURN self;
  END InitParser;
(* NewParser method: create a parser for another set of files which
   shares the options, warnings, program name and cache of self but owns
   its buffers, state stack, scanner, parser and error handler. *)
PROCEDURE NewParser(self: SGML.Parser; files: REF ARRAY OF TEXT;
    rds: REF ARRAY OF Rd.T := NIL): SGML.Parser =
  CONST
    InitialBufferSize = 15;
  VAR
    child := NEW(SGML.Parser);
  BEGIN
    (* Inherited from the parent. *)
    child.options := self.options;
    child.warn := self.warn;
    child.programName := self.programName;
    child.cache := self.cache;

    (* Specific to the new parser. *)
    child.files := NEW(REF ARRAY OF TEXT,NUMBER(files^));
    child.files^ := files^;
    child.rds := rds;
    child.attributes := NEW(REF ARRAY OF SGML.Attribute, InitialBufferSize);
    child.dataChunks := NEW(REF ARRAY OF SGML.CdataChunk, InitialBufferSize);
    child.stateStack := NEW(RefSeq.T);
    child.e := NEW(Err);
    child.s := NEW(SGMLC.Scanner);
    child.p := NEW(ParserPlus, parser := child);
    RETURN child;
  END NewParser;
(* Turn on the flags of self.warn named in self.options.enableWarning,
   which must not be NIL. Each name enables the flag of the same name;
   "min-tag" enables unclosed, empty and net together; "all" enables every
   flag. Unknown names are ignored. Flags are only ever turned on here. *)
PROCEDURE InitWarnings(self: SGML.Parser) =
  BEGIN
    FOR i := 0 TO LAST(self.options.enableWarning^) DO
      WITH w = self.options.enableWarning[i] DO
        IF Text.Equal(w,"mixed") THEN self.warn.mixed := TRUE;
        ELSIF Text.Equal(w,"sgmldecl") THEN self.warn.sgmldecl := TRUE;
        ELSIF Text.Equal(w,"should") THEN self.warn.should := TRUE;
        ELSIF Text.Equal(w,"default") THEN self.warn.default := TRUE;
        ELSIF Text.Equal(w,"duplicate") THEN self.warn.duplicate := TRUE;
        ELSIF Text.Equal(w,"undefined") THEN self.warn.undefined := TRUE;
        (* The unclosed flag used to be reachable only through "min-tag"
           or "all"; it can now be requested on its own like the others. *)
        ELSIF Text.Equal(w,"unclosed") THEN self.warn.unclosed := TRUE;
        ELSIF Text.Equal(w,"empty") THEN self.warn.empty := TRUE;
        ELSIF Text.Equal(w,"net") THEN self.warn.net := TRUE;
        ELSIF Text.Equal(w,"unused-param") THEN
          self.warn.unusedParam := TRUE;
        ELSIF Text.Equal(w,"notation-sysid") THEN
          self.warn.notationSysid := TRUE;
        ELSIF Text.Equal(w,"min-tag") THEN
          self.warn.unclosed := TRUE;
          self.warn.empty := TRUE;
          self.warn.net := TRUE;
        ELSIF Text.Equal(w,"all") THEN
          self.warn := Warnings{mixed := TRUE, sgmldecl := TRUE,
              should := TRUE, default := TRUE, duplicate := TRUE,
              undefined := TRUE, unclosed := TRUE, empty := TRUE,
              net := TRUE, unusedParam := TRUE, notationSysid := TRUE};
        END;
      END;
    END;
  END InitWarnings;
Initialize all the parser tables, then start parsing, calling back the
application for each significant chunk parsed.
(* Run method of the parser: parse every file of self.files, calling back
   the application "a", and return the number of errors counted by the
   error handler.

   The first run among the parsers sharing a cache loads the catalogs and
   the builtin entities and saves the resulting definitions in the cache.
   Each file then starts from a copy of these definitions.

   A file which cannot be opened is reported as "Cannot open"; any other
   exception raised while a file is processed is reported as "Error
   processing". Readers opened here are closed here; readers supplied
   through self.rds are left to their owner. *)
PROCEDURE RunParser(self: SGML.Parser; a: SGML.Application): CARDINAL =
  VAR
    rd: Rd.T;
    opened: BOOLEAN;
  BEGIN
    self.e.num := 0;
    self.e.application := a;
    self.application := a;
    a.parser := self;

    (* Once for all parsers using the same options (paths and catalogs)
       we need to fill the definitions with the catalogs content. The
       few builtin entities are inserted as well. Then, the resulting set
       of definitions is saved in the cache, to be reused for all files
       within the same parser and for all parsers sharing the same options
       (obtained with newParser()). *)
    IF self.cache.dtd = NIL THEN
      (* Start with clean tables for the definitions. *)
      self.defn.elements := NEW(TextRefTbl.Default).init();
      self.defn.attlists := NEW(TextRefTbl.Default).init();
      self.defn.notations := NEW(TextRefTbl.Default).init();
      self.defn.pEntities := NEW(TextRefTbl.Default).init();
      self.defn.gEntities := NEW(TextRefTbl.Default).init();
      self.defn.dEntities := NEW(TextRefTbl.Default).init();
      self.defn.publics := NEW(TextTextTbl.Default).init();

      (* Initialize the builtin entities *)
      AddElement(self,"#PCDATA",NIL,NIL,FALSE,TRUE);
      AddEntity(self,NEW(REF SGML.Entity, name := "amp",
          internalText := "&",
          dataType := SGML.EntityDataType.Sgml,
          declType := SGML.EntityDeclType.General));
      AddEntity(self,NEW(REF SGML.Entity, name := "gt",
          internalText := ">",
          dataType := SGML.EntityDataType.Sgml,
          declType := SGML.EntityDeclType.General));
      AddEntity(self,NEW(REF SGML.Entity,
          name := "lt", internalText := "<",
          dataType := SGML.EntityDataType.Sgml,
          declType := SGML.EntityDeclType.General));
      AddEntity(self,NEW(REF SGML.Entity, name := "quot",
          internalText := "\"", dataType := SGML.EntityDataType.Sgml,
          declType := SGML.EntityDeclType.General));

      (* Parse the catalog files. *)
      self.s := self.s.initSimple(self.e);
      self.s.pushState(SGMLCScanner.State.Catalog);
      FOR i := 0 TO LAST(self.options.addCatalog^) DO
        TRY
          ParseCatalog(self,self.options.addCatalog[i],
              Process.GetWorkingDirectory());
        EXCEPT ELSE
          a.error(SGML.ErrorEvent{0,SGML.ErrorType.OtherError,
              "Error processing " & self.options.addCatalog[i]});
        END;
      END;
      self.s.popState();

      (* Initialize the cache and save the definitions in it. *)
      self.cache.dtd := NEW(TextRefTbl.Default).init();
      CopyDefinitions(self.defn,self.cache.catalog);
    END;

    (* For each file, start with the catalogs definitions saved in the
       cache and parse the file. *)
    FOR i := 0 TO LAST(self.files^) DO
      self.fileNo := i;
      rd := NIL;
      opened := FALSE;
      TRY
        CopyDefinitions(self.cache.catalog,self.defn);
        self.stateStack := self.stateStack.init();
        self.currState := NIL;
        self.doctype := NIL;
        self.markup := FALSE;
        self.s := self.s.initSimple(self.e);

        IF self.rds = NIL THEN
          TRY
            rd := FileRd.Open(self.files[i]);
            opened := TRUE;
          EXCEPT
          | OSError.E => rd := NIL;
          END;
        ELSE
          rd := self.rds[i];
        END;

        IF rd = NIL THEN
          a.error(SGML.ErrorEvent{0,SGML.ErrorType.OtherError,
              "Cannot open " & self.files[i]});
        ELSE
          TRY
            self.s.pushFile(self.files[i],rd);
            self.s.setEntityResolver(NEW(EntityResolver, parser := self));
            self.p := self.p.init(self.s, self.e);
            self.p.parse();
          FINALLY
            (* Only release what was opened here. Closing a reader which
               is already closed is harmless. *)
            IF opened THEN
              TRY Rd.Close(rd); EXCEPT ELSE END;
            END;
          END;
        END;
      EXCEPT
      ELSE
        (* The file was opened; the failure happened while processing. *)
        a.error(SGML.ErrorEvent{0,SGML.ErrorType.OtherError,
            "Error processing " & self.files[i]});
      END;
    END;
    RETURN self.e.num;
  END RunParser;
Parse another file in the current context. The same parser state
is kept (entities, elements...).
(* Parse the content of "rd", known as "name", with a temporary scanner
   and parser while keeping the definitions of self. The scanner and
   parser of self are put back before returning, whatever happens. An
   exception raised while setting up or parsing stops the sub-parse and is
   reported to the application instead of being dropped silently. *)
PROCEDURE SubParse(self: SGML.Parser; name: TEXT; rd: Rd.T) =
  VAR
    oldS: SGMLC.Scanner;
    oldP: SGMLC.Parser;
    failed := FALSE;
  BEGIN
    oldS := self.s;
    oldP := self.p;
    TRY
      self.s := NEW(SGMLC.Scanner);
      self.p := NEW(ParserPlus, parser := self);
      self.s := self.s.initSimple(self.e);
      self.s.inMarkupDecl(self.markup);
      self.s.pushFile(name,rd);
      self.s.setEntityResolver(NEW(EntityResolver, parser := self));
      self.p := self.p.init(self.s, self.e);
      self.p.parse();
    EXCEPT ELSE
      failed := TRUE;
    END;

    (* Restore first, so that the application sees the outer parser if it
       asks for a location while handling the error. *)
    self.s := oldS;
    self.p := oldP;
    IF failed THEN
      self.application.error(SGML.ErrorEvent{0,SGML.ErrorType.OtherError,
          "Error processing " & name});
    END;
  END SubParse;
The doctype tells which Dtd to use. The Dtd is parsed as a subfile.
(* Load the Dtd of the current document type, between a startDtd and an
   endDtd event.

   The document type defaults to options.defaultDoctype and then to
   "HTML". The definitions obtained from a Dtd are cached under its
   generated system identifier and copied from the cache the next time the
   same Dtd is needed. When the document type has no entity, or the entity
   has no generated system identifier, the cache is bypassed; when no
   reader can be obtained the problem is reported to the application. The
   reader obtained here is closed once the Dtd has been parsed. *)
PROCEDURE ParseDtd(parser: SGML.Parser; externalId: SGML.ExternalId) =
  VAR
    rd: Rd.T;
    dtdId: TEXT := NIL;
    tmp: REFANY;
    defn: REF Definitions;
    entity: REF SGML.Entity;
  BEGIN
    (* When a default document type exists, use it. Otherwise complain and
       use HTML. *)
    IF parser.doctype = NIL THEN
      parser.doctype := parser.options.defaultDoctype;
      IF parser.doctype = NIL THEN
        parser.doctype := "HTML";
        parser.application.error(SGML.ErrorEvent{parser.p.offset(),SGML.
            ErrorType.Info, "Document Type not declared, HTML assumed"});
      END;
    END;

    parser.application.startDtd(SGML.StartDtdEvent{parser.p.offset(),
        parser.doctype,externalId});

    (* Some Dtd are fairly long to parse. The corresponding definitions
       are saved in the cache and reused the second time a Dtd
       is requested. The key to uniquely identify Dtds is the system ID.
       The document type may be unknown, or its file may not have been
       found, in which case there is no key. *)
    entity := GetEntity(parser,parser.doctype,SGML.EntityDeclType.Doctype);
    IF entity # NIL THEN
      dtdId := entity.externalId.generatedSystemId;
    END;

    IF dtdId # NIL AND parser.cache.dtd.get(dtdId,tmp) THEN
      defn := NARROW(tmp,REF Definitions);
      CopyDefinitions(defn^,parser.defn);
    ELSE
      rd := ResolveEntity(parser,parser.doctype,SGML.EntityDeclType.Doctype);
      IF rd = NIL THEN
        parser.application.error(SGML.ErrorEvent{0,SGML.ErrorType.
            OtherError,"Cannot open Dtd for DocType " & parser.doctype});
      ELSE
        TRY
          parser.markup := TRUE;
          SubParse(parser,parser.doctype,rd);
        FINALLY
          parser.markup := FALSE;
          (* Closing a reader which is already closed is harmless. *)
          TRY Rd.Close(rd); EXCEPT ELSE END;
        END;
        IF dtdId # NIL THEN
          defn := NEW(REF Definitions);
          CopyDefinitions(parser.defn,defn^);
          EVAL parser.cache.dtd.put(dtdId,defn);
        END;
      END;
    END;

    parser.application.endDtd(SGML.EndDtdEvent{parser.p.offset(),
        parser.doctype});
  END ParseDtd;
Not implemented yet. Awaiting requests for it!
(* Halt method of the parser. Halting is not implemented: the body
   consists solely of an assertion which always fails. *)
PROCEDURE HaltParser(<*UNUSED*>self: SGML.Parser) =
BEGIN
<* ASSERT FALSE *> (* Perhaps force EOF into the scanner... *)
END HaltParser;
(* InhibitMessages method of the parser: record the requested setting in
   self.inhibit. Nothing else is done with the value here. *)
PROCEDURE InhibitMessages(self: SGML.Parser; inhibit: BOOLEAN) =
BEGIN
self.inhibit := inhibit;
END InhibitMessages;
Not implemented yet. Awaiting requests for it.
(* SubdocumentParser method of the parser. Not implemented: the assertion
   always fails, and the RETURN only exists to give the function a result
   (hence the NOWARN pragma on that line). *)
PROCEDURE SubdocumentParser(self: SGML.Parser; <*UNUSED*>systemId: TEXT):
SGML.Parser =
BEGIN
<*ASSERT FALSE*>
RETURN self; <*NOWARN*>
END SubdocumentParser;
(* Init method of the application: there is nothing to set up, self is
   returned unchanged. *)
PROCEDURE ApplicationInit(self: SGML.Application): SGML.Application =
BEGIN
RETURN self;
END ApplicationInit;
Unfortunately the detailed location does not carry all the information.
It allows for one full set of data within a file plus an entity name
and offset. What we have in reality is a stack of entities when
nested parameter entities are encountered.
(* GetDetailedLocation method of the application. The position argument
   is not used: the location always describes where the parser currently
   is. With an empty scanner input stack, the location is the current one
   in the current input. Otherwise, the file, line, column and byte offset
   come from the input at the bottom of the stack, while the current input
   supplies the entity name and the parser offset becomes the entity
   offset. *)
PROCEDURE GetDetailedLocation(self: SGML.Application;
    <*UNUSED*>pos: SGML.Position): SGML.DetailedLocation =
  VAR
    where: SGML.DetailedLocation;
    outermost: SGMLCScanner.Input;
  BEGIN
    WITH parser = self.parser.p, scanner = self.parser.s DO
      IF scanner.inputStack.size() # 0 THEN
        outermost := scanner.inputStack.get(0);
        where.filename := outermost.name;
        where.lineNumber := outermost.currentLine;
        where.columnNumber := outermost.currentCol;
        where.byteOffset := outermost.offset;
        where.entityName := scanner.input.name;
        where.entityOffset := parser.offset();
      ELSE
        where.filename := scanner.input.name;
        where.lineNumber := parser.line();
        where.columnNumber := parser.column();
        where.byteOffset := parser.offset();
        where.entityName := NIL;
        where.entityOffset := 0;
      END;
    END;
    RETURN where;
  END GetDetailedLocation;
We are reusing the same scanner for catalogs as for SGML files.
(* Read the catalog "name", searched from "dir", with the scanner of the
   parser. Recognized entries: CATALOG (push another catalog file, searched
   from the directory of the current one), PUBLIC (public to system
   identifier mapping), SGMLDECL (argument skipped) and DOCTYPE (doctype
   entity with the given system identifier). Any other symbol is reported
   as an invalid entry; any exception is reported as a premature end of
   file. *)
PROCEDURE ParseCatalog(parser: SGML.Parser; name, dir: TEXT) =
  VAR
    tok: SGMLC.ScanSymbol;
    pubId, sysId, doctypeName: TEXT;
  BEGIN
    PushCatalogFile(parser,name,dir);
    TRY
      LOOP
        parser.s.get(tok);
        CASE tok.sym OF
        | SGMLC.Symbol.Eof =>
            EXIT;

        | SGMLC.Symbol.CATALOGkw =>
            parser.s.get(tok);
            PushCatalogFile(parser,tok.string,
                Pathname.Prefix(parser.s.input.name));

        | SGMLC.Symbol.PUBLICkw =>
            parser.s.get(tok);
            pubId := tok.string;
            parser.s.get(tok);
            sysId := tok.string;
            EVAL parser.defn.publics.put(pubId, sysId);

        | SGMLC.Symbol.SGMLDECLkw =>
            (* The argument is read and ignored. *)
            parser.s.get(tok);

        | SGMLC.Symbol.DOCTYPEkw =>
            parser.s.get(tok);
            doctypeName := tok.string;
            parser.s.get(tok);
            AddEntity(parser,NEW(REF SGML.Entity,
                name := doctypeName,
                declType := SGML.EntityDeclType.Doctype,
                dataType := SGML.EntityDataType.Sgml,
                internalText := NIL,
                externalId := SGML.ExternalId{tok.string,NIL,NIL}));

        ELSE
          IF tok.string = NIL THEN tok.string := ""; END;
          parser.application.error(SGML.ErrorEvent{0,
              SGML.ErrorType.OtherError, "Invalid catalog entry " &
              tok.string});
        END;
      END;
    EXCEPT ELSE
      parser.application.error(SGML.ErrorEvent{0,
          SGML.ErrorType.OtherError, "Premature end of file in catalog"});
    END;
  END ParseCatalog;
Find the catalog file along the available paths.
(* Locate the catalog "name" with FindFile, starting from "dir" and then
   along the search directories, and push it on the scanner. When the file
   cannot be found or opened, an error is sent to the application and
   nothing is pushed. *)
PROCEDURE PushCatalogFile(parser: SGML.Parser; name, dir: TEXT) =
  VAR
    rd: Rd.T := NIL;
    path := FindFile(name,dir,parser.options.addSearchDir);
  BEGIN
    IF path = NIL THEN
      (* Not found: keep the plain name for the error message. *)
      path := name;
    ELSE
      rd := IO.OpenRead(path);
    END;

    IF rd # NIL THEN
      parser.s.pushFile(path, rd);
    ELSE
      parser.application.error(SGML.ErrorEvent{0,SGML.ErrorType.OtherError,
          "Cannot open catalog " & path});
    END;
  END PushCatalogFile;
Insert a new element in the table of defined elements. There may
already be an entry for this element holding the description of its
attributes.
(* Store the description of element "name" (content, content model "m"
   and tag omission flags) in the table of elements.

   A new declaration always replaces an earlier one for the same name.
   When duplicate warnings are enabled, the redefinition is also reported
   to the application. Enabling the warning therefore never changes which
   declaration is in effect, in line with AddEntity. *)
PROCEDURE AddElement(parser: SGML.Parser; name: TEXT; content: REFANY;
    m: FSM.T; omitS, omitE: BOOLEAN) =
  VAR
    existed: BOOLEAN;
  BEGIN
    (* put returns TRUE when the name was already in the table. *)
    existed := parser.defn.elements.put(name,NEW(ElementDesc, name := name,
        content := content, fsm := m, omitS := omitS, omitE := omitE));
    IF existed AND parser.warn.duplicate THEN
      parser.application.error(SGML.ErrorEvent{0,SGML.ErrorType.OtherError,
          "Redefinition of element " & name});
    END;
  END AddElement;
Add the specified attributes to the list for the element.
(* Register "attributes" as the attribute table of element "name". When
   the element already has a table, its entries are first copied into
   "attributes", replacing entries of the same name, so that the table
   registered is the union of both. *)
PROCEDURE AddAttributes(parser: SGML.Parser; name: TEXT;
    attributes: TextRefTbl.T) =
  VAR
    previous, value: REFANY;
    key: TEXT;
    it: TextRefTbl.Iterator;
  BEGIN
    IF parser.defn.attlists.get(name,previous) THEN
      it := NARROW(previous,TextRefTbl.T).iterate();
      LOOP
        IF NOT it.next(key,value) THEN EXIT; END;
        EVAL attributes.put(key,value);
      END;
    END;
    EVAL parser.defn.attlists.put(name,attributes);
  END AddAttributes;
Add the entity into the appropriate table (general, parameter, doctype).
For external entities, any missing components (system id, generated
system id) are computed using the publics table and the search paths.
(* Complete the external identifier of "e" and store the entity in the
   table matching its declaration type. A missing system identifier is
   looked up in the publics table from the public identifier; a generated
   system identifier is then computed with FindFile, relative to the
   directory of the current scanner input. A redefinition is reported when
   duplicate warnings are enabled. *)
PROCEDURE AddEntity(p: SGML.Parser; e: REF SGML.Entity) =
  VAR
    redefined: BOOLEAN;
  BEGIN
    WITH id = e.externalId DO
      IF id.systemId = NIL AND id.publicId # NIL THEN
        EVAL p.defn.publics.get(id.publicId,id.systemId);
      END;
      IF id.systemId # NIL THEN
        id.generatedSystemId := FindFile(id.systemId,
            Pathname.Prefix(p.s.input.name),p.options.addSearchDir);
      END;
    END;

    CASE e.declType OF
    | SGML.EntityDeclType.General =>
        redefined := p.defn.gEntities.put(e.name,e);
    | SGML.EntityDeclType.Parameter =>
        redefined := p.defn.pEntities.put(e.name,e);
    | SGML.EntityDeclType.Doctype =>
        redefined := p.defn.dEntities.put(e.name,e);
    ELSE <* ASSERT FALSE *>
    END;

    IF redefined AND p.warn.duplicate THEN
      p.application.error(SGML.ErrorEvent{0,SGML.ErrorType.OtherError,
          "Redefinition of entity " & e.name});
    END;
  END AddEntity;
Add the notation declaration in the table.
(* Store the notation under its name, replacing any previous notation of
   the same name; the result of put is ignored. *)
PROCEDURE AddNotation(parser: SGML.Parser; n: REF SGML.Notation) =
BEGIN
EVAL parser.defn.notations.put(n.name,n);
END AddNotation;
Check that the DataChunk buffer is large enough.
(* Increment p.nbData and, when it goes past the last index of
   p.dataChunks, replace the buffer by one twice as large holding the same
   content. *)
PROCEDURE IncData(p: SGML.Parser) =
  VAR
    grown: REF ARRAY OF SGML.CdataChunk;
    oldSize: CARDINAL;
  BEGIN
    INC(p.nbData);
    IF p.nbData <= LAST(p.dataChunks^) THEN RETURN; END;

    oldSize := NUMBER(p.dataChunks^);
    grown := NEW(REF ARRAY OF SGML.CdataChunk,oldSize * 2);
    SUBARRAY(grown^,0,oldSize) := p.dataChunks^;
    p.dataChunks := grown;
  END IncData;
Check that the attribute buffer is large enough.
(* Increment p.nbAttr and, when it goes past the last index of
   p.attributes, replace the buffer by one twice as large holding the same
   content. *)
PROCEDURE IncAttr(p: SGML.Parser) =
  VAR
    grown: REF ARRAY OF SGML.Attribute;
    oldSize: CARDINAL;
  BEGIN
    INC(p.nbAttr);
    IF p.nbAttr <= LAST(p.attributes^) THEN RETURN; END;

    oldSize := NUMBER(p.attributes^);
    grown := NEW(REF ARRAY OF SGML.Attribute,oldSize * 2);
    SUBARRAY(grown^,0,oldSize) := p.attributes^;
    p.attributes := grown;
  END IncAttr;
Find a parameter entity from the table.
(* Resolve method of the entity resolver given to the scanner: return a
   reader on the parameter entity "name" as found by ResolveEntity, which
   returns NIL when the entity is unknown or cannot be read. *)
PROCEDURE ResolveParameterEntity(self: EntityResolver; name: TEXT): Rd.T =
BEGIN
RETURN ResolveEntity(self.parser,name,SGML.EntityDeclType.Parameter);
END ResolveParameterEntity;
Find an entity of the specified type from the corresponding table and
open a reader for it.
(* Return a reader on the replacement text of the entity "name" of the
   given declaration type: a text reader for an internal entity, otherwise
   the result of IO.OpenRead on the generated system identifier. NIL is
   returned when the entity is unknown or has neither internal text nor a
   generated system identifier. *)
PROCEDURE ResolveEntity(self: SGML.Parser; name: TEXT;
    type: SGML.EntityDeclType): Rd.T =
  VAR
    entity := GetEntity(self,name,type);
  BEGIN
    IF entity = NIL THEN
      RETURN NIL;
    ELSIF entity.internalText # NIL THEN
      RETURN TextRd.New(entity.internalText);
    ELSIF entity.externalId.generatedSystemId = NIL THEN
      RETURN NIL;
    ELSE
      RETURN IO.OpenRead(entity.externalId.generatedSystemId);
    END;
  END ResolveEntity;
(* Look up the entity "name" in the table matching the declaration type
   (general, parameter or doctype). Returns NIL when there is no such
   entity; any other declaration type fails an assertion. *)
PROCEDURE GetEntity(self: SGML.Parser; name: TEXT; type: SGML.EntityDeclType):
    REF SGML.Entity =
  VAR
    entry: REFANY;
    known: BOOLEAN;
  BEGIN
    WITH d = self.defn DO
      CASE type OF
      | SGML.EntityDeclType.General   => known := d.gEntities.get(name,entry);
      | SGML.EntityDeclType.Parameter => known := d.pEntities.get(name,entry);
      | SGML.EntityDeclType.Doctype   => known := d.dEntities.get(name,entry);
      ELSE <* ASSERT FALSE *>
      END;
    END;
    IF NOT known THEN RETURN NIL; END;
    RETURN NARROW(entry, REF SGML.Entity);
  END GetEntity;
Look for a file along the search paths or in the current directory.
(* Return the path of the file "name". An absolute name is returned as is,
   without checking that the file exists. Otherwise the name is tried in
   "cwd" and then in each of "dirs", in order, and the first path for which
   FS.Status succeeds is returned. NIL is returned when the file is found
   nowhere. *)
PROCEDURE FindFile(name, cwd: TEXT; dirs: REF ARRAY OF TEXT): TEXT =

  (* The path of name within dir when FS.Status accepts it, else NIL. *)
  PROCEDURE Probe(dir: TEXT): TEXT =
    VAR
      candidate: TEXT;
    BEGIN
      TRY
        candidate := Pathname.Join(dir,name,NIL);
        EVAL FS.Status(candidate);
        RETURN candidate;
      EXCEPT
      | OSError.E => RETURN NIL;
      END;
    END Probe;

  VAR
    found: TEXT;
  BEGIN
    IF Pathname.Absolute(name) THEN RETURN name; END;

    found := Probe(cwd);
    IF found # NIL THEN RETURN found; END;

    FOR i := 0 TO LAST(dirs^) DO
      found := Probe(dirs[i]);
      IF found # NIL THEN RETURN found; END;
    END;
    RETURN NIL;
  END FindFile;
Initialize the stack of FSM states which keep track of what was received
in which nested element. To start, we assume we are in an element (!INIT)
which expects a single element named after the document type.
(* Create the initial frame, named "!INIT" (StartAtom), whose content
   model is built by FSM.New from the name of the document type and then
   passed to FSM.Wrap. The frame becomes the current state, positioned on
   the start node of that model, and the pcdata flag is cleared. *)
PROCEDURE StartContent(parser: SGML.Parser) =
VAR
fsm: FSM.T;
BEGIN
FSM.New(fsm,Atom.FromText(parser.doctype));
FSM.Wrap(fsm); <*NOWARN*> (* This FSM is valid *)
parser.currState := NEW(StateFrame, name := StartAtom, omitE := FALSE,
currNode := FSM.StartNode(fsm), fsm := fsm);
parser.pcdata := FALSE;
END StartContent;
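End of the document, handled by EndContent below: any currently opened
element should allow an end tag next, and the end tag should be optional.
ExplainNode and ExplainFSMError are helpers used to report content errors.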
(* Build a description of node "id" of "fsm": one line for the node, one
   line per outgoing edge with its label and destination, and one line for
   each of the else and skip successors which is not NIL. Node numbers are
   formatted with Fmt.Unsigned. *)
PROCEDURE ExplainNode(fsm: FSM.T; id: CARDINAL; edges: REF ARRAY OF FSM.Edge;
    else, skip: FSM.Node): TEXT =

  (* The printable number of a node of fsm. *)
  PROCEDURE Number(node: FSM.Node): TEXT =
    BEGIN
      RETURN Fmt.Unsigned(FSM.NodeId(fsm,node));
    END Number;

  VAR
    report := "Current state is node " & Fmt.Unsigned(id) &
        " with possible transitions:\n";
  BEGIN
    FOR k := FIRST(edges^) TO LAST(edges^) DO
      WITH edge = edges[k] DO
        report := report & " event " & Atom.ToText(edge.label) &
            " leads to node " & Number(edge.destination) & "\n";
      END;
    END;
    IF else # NIL THEN
      report := report & " else leads to node " & Number(else) & "\n";
    END;
    IF skip # NIL THEN
      report := report & " skip leads to node " & Number(skip) & "\n";
    END;
    RETURN report;
  END ExplainNode;
(* Send the message "t", of kind "error", to the application. When the
   unclosed warnings are enabled, the message is followed by a description
   of the current node of the current element and of every node reached by
   following the skip links from it. *)
PROCEDURE ExplainFSMError(parser: SGML.Parser; error: SGML.ErrorType;
    t: TEXT) =
  VAR
    details := "\n";
    nodeId: CARDINAL;
    onSkip, onElse: FSM.Node;
    outgoing: REF ARRAY OF FSM.Edge;
    model := parser.currState.fsm;
    cursor := parser.currState.currNode;
  BEGIN
    (* This is a bit verbose, output only when warnings enabled *)
    IF parser.warn.unclosed THEN
      LOOP
        IF cursor = NIL THEN EXIT; END;
        FSM.NodeContent(model, cursor, nodeId, outgoing, onElse, onSkip);
        details := details &
            ExplainNode(model, nodeId, outgoing, onElse, onSkip);
        cursor := onSkip;
      END;
    END;
    parser.application.error(SGML.ErrorEvent{0,error,t & details});
  END ExplainFSMError;
(* End of the document: close the elements still open, innermost first,
   until the state stack is empty. For each one, an error is reported
   unless its content may end here and its end tag is optional; the
   omission is reported when unclosed warnings are enabled; an endElement
   event is sent and the enclosing frame becomes current. *)
PROCEDURE EndContent(parser: SGML.Parser) =
  VAR
    tag: TEXT;
  BEGIN
    WHILE parser.stateStack.size() # 0 DO
      tag := Atom.ToText(parser.currState.name);

      IF NOT FSM.Exit(parser.currState.currNode) OR
         NOT parser.currState.omitE THEN
        ExplainFSMError(parser,SGML.ErrorType.OtherError,
            "Premature end of document missing end tag: " & tag);
      END;

      IF parser.warn.unclosed THEN
        parser.application.error(SGML.ErrorEvent{0,SGML.ErrorType.Warning,
            "Omitted </" & tag & ">"});
      END;

      parser.application.endElement(
          SGML.EndElementEvent{parser.p.offset(),tag});
      parser.currState := parser.stateStack.remhi();
    END;
  END EndContent;
A new element starts.
(* A start tag was received: clear the pcdata flag, then let
   StartElement2 place the element in the content of the current one. *)
PROCEDURE StartElement(parser: SGML.Parser;
READONLY e: SGML.StartElementEvent) =
BEGIN
parser.pcdata := FALSE;
StartElement2(parser,e);
END StartElement;
(* Character data was received. Unless the pcdata flag is set, the data is
   placed in the content of the current element as an empty pseudo element
   named "#PCDATA".
   NOTE(review): this procedure does not set parser.pcdata itself;
   presumably the caller does so after the first chunk -- confirm. *)
PROCEDURE StartData(parser: SGML.Parser) =
BEGIN
IF NOT parser.pcdata THEN
StartElement2(parser,SGML.StartElementEvent{0,"#PCDATA",
SGML.ElementContentType.Empty,FALSE,NIL});
END;
END StartData;
(* Place the element described by "e" in the content of the current
   element. The loop tries, in this order:
   - the element is accepted by the current content model: advance the
     model and, unless the element has empty content, push the current
     frame and make a frame for the new element current;
   - the current element may end here, its end tag is optional and more
     than one frame is stacked: generate its endElement event, pop, and
     try again in the enclosing element;
   - FSM.Expect names an element whose start tag is optional: generate its
     startElement event, enter it, and try again inside it;
   - otherwise report a misplaced tag and give up.
   An accepted element which has no description is reported as unknown and
   the content model is left where it was. *)
PROCEDURE StartElement2(parser: SGML.Parser;
READONLY e: SGML.StartElementEvent) =
VAR
nextNode: FSM.Node;
name := Atom.FromText(e.gi);
empty := e.contentType = SGML.ElementContentType.Empty;
element: ElementDesc;
tmp: REFANY;
expect: Atom.T;
ne: SGML.StartElementEvent;
BEGIN
LOOP
(* The element is accepted within the current element. *)
IF FSM.Enter(parser.currState.currNode,name,nextNode) THEN
IF NOT parser.defn.elements.get(e.gi,tmp) THEN
parser.application.error(SGML.ErrorEvent{0,SGML.ErrorType.
OtherError,"Unknown element " & e.gi});
RETURN;
END;
parser.currState.currNode := nextNode;
element := tmp;
(* An element with empty content has no frame of its own. *)
IF empty THEN RETURN; END;
parser.stateStack.addhi(parser.currState);
parser.currState := NEW(StateFrame, name := name,
omitE := element.omitE, currNode := FSM.StartNode(element.fsm),
fsm := element.fsm);
RETURN;
(* Perhaps the current element should end (end tag allowed next and is
optional). Then the new element may become allowed. *)
ELSIF FSM.Exit(parser.currState.currNode) AND parser.currState.omitE AND
parser.stateStack.size() > 1 THEN
IF parser.warn.unclosed THEN
parser.application.error(SGML.ErrorEvent{0,SGML.ErrorType.Warning,
"Omitted </" & Atom.ToText(parser.currState.name) & ">"});
END;
parser.application.endElement(SGML.EndElementEvent{parser.p.offset(),
Atom.ToText(parser.currState.name)});
parser.currState := parser.stateStack.remhi();
(* Perhaps a single type of element is accepted here and its start
tag is optional, in which case it is provided. Then the new element
may become allowed. *)
ELSE
expect := FSM.Expect(parser.currState.currNode);
IF expect # NIL AND parser.defn.elements.get(Atom.ToText(expect),tmp)
AND NARROW(tmp,ElementDesc).omitS THEN
(* The result of Enter is ignored: only nextNode is used. *)
EVAL FSM.Enter(parser.currState.currNode,expect,nextNode);
parser.currState.currNode := nextNode;
element := tmp;
(* Build the start event of the implied element; it has no attributes. *)
ne.pos := parser.p.offset();
ne.gi := element.name;
IF element.content # EmptyAtom THEN
ne.contentType := SGML.ElementContentType.Mixed;
ELSE
ne.contentType := SGML.ElementContentType.Empty;
END;
ne.included := FALSE;
ne.attributes := NEW(REF ARRAY OF SGML.Attribute, 0);
IF parser.warn.unclosed THEN
parser.application.error(SGML.ErrorEvent{0,SGML.ErrorType.Warning,
"Omitted <" & ne.gi & ">"});
END;
parser.application.startElement(ne);
parser.stateStack.addhi(parser.currState);
parser.currState := NEW(StateFrame, name := expect,
omitE := element.omitE, currNode := FSM.StartNode(element.fsm),
fsm := element.fsm);
ELSE
ExplainFSMError(parser,SGML.ErrorType.OtherError,
"Misplaced tag " & e.gi & " within element " &
Atom.ToText(parser.currState.name));
RETURN;
END;
END;
END;
END StartElement2;
End element received.
(* An end tag for element "n" was received.

   When it matches the current element, the enclosing frame becomes
   current; an error is reported first if the content of the element is
   not complete. When it does not match, but the current element is
   complete, has an optional end tag and more than one frame is stacked,
   the omitted end tag is generated and the test is repeated one level up.
   Otherwise the end tag is reported as misplaced and a dummy empty start
   element is sent to the application to keep the events balanced.

   Whether the content may end is evaluated for each frame examined, not
   just for the innermost one. *)
PROCEDURE EndElement(parser: SGML.Parser; n: TEXT) =
  VAR
    name := Atom.FromText(n);
    mayExit: BOOLEAN;
    ne: SGML.StartElementEvent;
  BEGIN
    parser.pcdata := FALSE;
    LOOP
      (* Must be recomputed on every iteration: after an omitted end tag
         has been generated, the current frame is the parent, whose content
         model has its own position. *)
      mayExit := FSM.Exit(parser.currState.currNode);

      (* Correct end tag received. *)
      IF parser.currState.name = name THEN
        (* The element content is not complete! *)
        IF NOT mayExit THEN
          ExplainFSMError(parser,SGML.ErrorType.OtherError,
              "Premature end tag " & n);
        END;
        parser.currState := parser.stateStack.remhi();
        RETURN;

      (* Incorrect end tag. Perhaps an indication of missing omitted
         end tags. Check if the current element is complete and its
         end tag optional. If so, generate the omitted end tag and
         retry at the next level, in the loop. *)
      ELSIF mayExit AND parser.currState.omitE AND
            parser.stateStack.size() > 1 THEN
        IF parser.warn.unclosed THEN
          parser.application.error(SGML.ErrorEvent{0,SGML.ErrorType.Warning,
              "Omitted </" & Atom.ToText(parser.currState.name) & ">"});
        END;
        parser.application.endElement(SGML.EndElementEvent{parser.p.offset(),
            Atom.ToText(parser.currState.name)});
        parser.currState := parser.stateStack.remhi();

      (* This end tag just should not be there. Create a dummy start tag
         just to keep these balanced. *)
      ELSE
        ExplainFSMError(parser,SGML.ErrorType.OtherError,
            "Misplaced end tag " & n & " within element " &
            Atom.ToText(parser.currState.name));
        ne.pos := parser.p.offset();
        ne.gi := n;
        ne.contentType := SGML.ElementContentType.Empty;
        ne.included := FALSE;
        ne.attributes := NEW(REF ARRAY OF SGML.Attribute, 0);
        parser.application.startElement(ne);
        RETURN;
      END;
    END;
  END EndElement;
VAR
(* Shared array without any element, used by CopyTextArray in place of a
   NIL array. *)
emptyArray := NEW(REF ARRAY OF TEXT,0);
Copy a text array, use an empty array if NIL, and append add1
if non NIL. Used to copy optional parsing options, merged with a
possible default value add1.
(* Replace "a" by a newly allocated copy of itself, a NIL array being
   treated as an empty one. When "add1" is not NIL, it is appended as the
   last element of the copy. *)
PROCEDURE CopyTextArray(VAR a: REF ARRAY OF TEXT; add1: TEXT := NIL) =
  VAR
    copy: REF ARRAY OF TEXT;
    count: CARDINAL;
  BEGIN
    IF a = NIL THEN a := emptyArray; END;
    count := NUMBER(a^);
    IF add1 = NIL THEN
      copy := NEW(REF ARRAY OF TEXT,count);
      copy^ := a^;
    ELSE
      copy := NEW(REF ARRAY OF TEXT,count + 1);
      SUBARRAY(copy^,0,count) := a^;
      copy[count] := add1;
    END;
    a := copy;
  END CopyTextArray;
(* Return a new table holding the same key and value pairs as "tbl". The
   values themselves are shared, not copied. *)
PROCEDURE CopyTextRefTbl(tbl: TextRefTbl.T): TextRefTbl.Default =
  VAR
    copy := NEW(TextRefTbl.Default).init();
    it := tbl.iterate();
    key: TEXT;
    value: REFANY;
  BEGIN
    LOOP
      IF NOT it.next(key,value) THEN EXIT; END;
      EVAL copy.put(key,value);
    END;
    RETURN copy;
  END CopyTextRefTbl;
(* Return a new table holding the same key and value pairs as "tbl". *)
PROCEDURE CopyTextTextTbl(tbl: TextTextTbl.T): TextTextTbl.Default =
  VAR
    copy := NEW(TextTextTbl.Default).init();
    it := tbl.iterate();
    key, value: TEXT;
  BEGIN
    LOOP
      IF NOT it.next(key,value) THEN EXIT; END;
      EVAL copy.put(key,value);
    END;
    RETURN copy;
  END CopyTextTextTbl;
(* Fill "to" with new tables holding the same entries as those of "from".
   The tables are duplicated, the descriptions they refer to are shared. *)
PROCEDURE CopyDefinitions(READONLY from: Definitions; VAR to: Definitions) =
  BEGIN
    (* Elements, their attributes and the notations. *)
    to.elements  := CopyTextRefTbl(from.elements);
    to.attlists  := CopyTextRefTbl(from.attlists);
    to.notations := CopyTextRefTbl(from.notations);

    (* The three kinds of entities. *)
    to.pEntities := CopyTextRefTbl(from.pEntities);
    to.gEntities := CopyTextRefTbl(from.gEntities);
    to.dEntities := CopyTextRefTbl(from.dEntities);

    (* Public to system identifier mapping. *)
    to.publics := CopyTextTextTbl(from.publics);
  END CopyDefinitions;
BEGIN
  (* Preallocated one element arrays, one per type of marked section. *)
  CDataParam := NEW(REF ARRAY OF SGML.MarkedSectionParam, 1);
  CDataParam[0].type := SGML.MarkedSectionParamType.CData;
  CDataParam[0].entityName := "CData";

  IncludeParam := NEW(REF ARRAY OF SGML.MarkedSectionParam, 1);
  IncludeParam[0].type := SGML.MarkedSectionParamType.Include;
  IncludeParam[0].entityName := "Include";

  IgnoreParam := NEW(REF ARRAY OF SGML.MarkedSectionParam, 1);
  IgnoreParam[0].type := SGML.MarkedSectionParamType.Ignore;
  IgnoreParam[0].entityName := "Ignore";

  (* Atoms for the two reserved names used in content models. *)
  PCDataAtom := Atom.FromText("#PCDATA");
  EmptyAtom := Atom.FromText("EMPTY");
END SGML.