html.icn - source code

Source file html.icn
############################################################################
#
#	File:     html.icn
#
#	Subject:  Procedures for parsing HTML
#
#	Author:   Gregg M. Townsend
#
#	Date:     July 4, 2000
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#	These procedures parse HTML files:
#
#	htchunks(f)	generates the basic chunks -- tags and text --
#			that compose an HTML file.
#
#	htrefs(f)	generates the tagname/keyword/value combinations
#			that reference other files.
#
#	These procedures process strings from HTML files:
#
#	httag(s)	extracts the name of a tag.
#
#	htvals(s)	generates the keyword/value pairs from a tag.
#
#	urlmerge(base,new) interprets a new URL in the context of a base.
#
############################################################################
#
#   	htchunks(f) generates the HTML chunks from file f.
#	It returns strings beginning with
#
#		<!--	for unclosed comments (legal comments are deleted)
#		<	for tags (will end with ">" unless unclosed at EOF)
#	anything else	for text
#
#	At this level entities such as &amp are left unprocessed and all
#	whitespace is preserved, including newlines.
#
############################################################################
#
#	htrefs(f) extracts file/url references from within an HTML file
#	and generates a string of the form
#		tagname keyword value
#   	for each reference.
#
#	A single space character separates the three fields, but if no
#	value is supplied for the keyword, no space follows the keyword.
#	Tag and keyword names are always returned in upper case.
#
#	Quotation marks are stripped from the value, but note that the
#	value can contain spaces or other special characters (although
#	by strict HTML rules it probably shouldn't).
#
#       A table in the code determines which fields are references to
#	other files.  For example, with <IMG>, SRC= is a reference but
#	WIDTH= is not.  The table is based on the HTML 4.0 standard:
#		http://www.w3.org/TR/REC-html40/
#
############################################################################
#
#	httag(s) extracts and returns the tag name from within an HTML
#	tag string of the form "<tagname...>".   The tag name is returned
#	in upper case.
#
############################################################################
#
#	htvals(s) generates the tag values contained within an HTML tag
#	string of the form "<tagname kw=val kw=val ...>".   For each
#	keyword=value pair beyond the tagname, a string of the form
#
#		keyword value
#
#	is generated.  One space follows the keyword, which is returned
#	in upper case, and quotation marks are stripped from the value.
#	The value itself can be an empty string.
#
#	For each keyword given without a value, the keyword is generated
#	in upper case with no following space.
#
#	Parsing is somewhat tolerant of errors.
#
############################################################################
#
#	urlmerge(base,new) interprets a full or partial new URL in the
#	context of a base URL, returning the combined URL.
#
#	Here are some examples of applying urlmerge() with a base value
#	of "http://www.vcu.edu/misc/sched.html" and a new value as given:
#
#	new		result
#	-------------	-------------------
#	#tuesday	http://www.vcu.edu/misc/sched.html#tuesday
#	bulletin.html	http://www.vcu.edu/misc/bulletin.html
#	./results.html	http://www.vcu.edu/misc/results.html
#	images/rs.gif	http://www.vcu.edu/misc/images/rs.gif
#	../		http://www.vcu.edu/
#	/greet.html	http://www.vcu.edu/greet.html
#	file:a.html	file:a.html
#
#	Path components of "./" and "../" at the beginning of the
#	new URL are handled specially to produce a simpler result.
#	No other simplifications are applied.
#
############################################################################


#   htchunks(f) -- generate HTML chunks from file f

procedure htchunks(f)			#: generate chunks of HTML file
   local prev, line, s

   "" ? repeat {

      if pos(0) then
         &subject := (read(f) || "\n") | fail

      if ="<!--" then
         suspend htc_comment(f)		# fails if comment is legal
      else if ="<" then
         suspend htc_tag(f)		# generate tag
      else
         suspend htc_text(f)		# generate text chunk

      }
end

procedure htc_tag(f)
   local s

   s := "<"
   repeat {
      if s ||:= tab(upto('>') + 1) then
         return s			# completed tag
      s ||:= tab(0)
      &subject := (read(f) || "\n") | break
      }
   return s				# unclosed tag
end

procedure htc_comment(f)
   local s

   s := ""
   repeat {
      if s ||:= tab(find("-->") + 3) then
         fail				# normal case: discard comment
      s ||:= tab(0)
      &subject := (read(f) || "\n") | break
      }

   &subject := s			# rescan unclosed comment
   return "<!--"			# return error indicator
end

procedure htc_text(f)
   local s

   s := ""
   repeat {
      if s ||:= tab(upto('<')) then
         return s
      s ||:= tab(0)
      &subject := (read(f) || "\n") | return s
      }
end


##  htrefs(f) -- generate references from HTML file f

procedure htrefs(f)			#: generate references from HTML file
   local tag, tagname, kwset, s
   static ttable
   initial {
      ttable := table()
      ttable["A"]	:= set(["HREF"])
      ttable["APPLET"]	:= set(["CODEBASE"])
      ttable["AREA"]	:= set(["HREF"])
      ttable["BASE"]	:= set(["HREF"])
      ttable["BLOCKQUOTE"] := set(["CITE"])
      ttable["BODY"]	:= set(["BACKGROUND"])
      ttable["DEL"]	:= set(["CITE"])
      ttable["FORM"]	:= set(["ACTION"])
      ttable["FRAME"]	:= set(["SRC", "LONGDESC"])
      ttable["HEAD"]	:= set(["PROFILE"])
      ttable["IFRAME"]	:= set(["SRC", "LONGDESC"])
      ttable["IMG"]	:= set(["SRC", "LONGDESC", "USEMAP"])
      ttable["INPUT"]	:= set(["SRC", "USEMAP"])
      ttable["INS"]	:= set(["CITE"])
      ttable["LINK"]	:= set(["HREF"])
      ttable["OBJECT"]	:= set(["CLASSID","CODEBASE","DATA","ARCHIVE","USEMAP"])
      ttable["Q"]	:= set(["CITE"])
      ttable["SCRIPT"]	:= set(["SRC", "FOR"])
      }

   every tag := htchunks(f) do {
      tagname := httag(tag) | next
      kwset := \ttable[tagname] | next
      every s := htvals(tag) do
         if member(kwset, s ? tab(upto(' '))) then
            suspend tagname || " " || s
      }
end



##  httag(s) -- return the name of the HTML tag s

procedure httag(s)			#: extract name of HTML tag
   static idset, wset, lcase, ucase
   initial {
      idset := &letters ++ &digits ++ '.-/'
      wset := ' \t\r\n\v\f'
      lcase := string(&lcase)
      ucase := string(&ucase)
   }

   s ? {
      ="<" | fail
      tab(many(wset))
      return map(tab(many(idset)), lcase, ucase)
   }
end



##  htvals(s) -- generate tag values of HTML tag s

procedure htvals(s)			#: generate values in HTML tag
   local kw
   static idset, wset, qset, lcase, ucase
   initial {
      idset := &letters ++ &digits ++ '.-/'
      wset := ' \t\r\n\v\f'
      qset := wset ++ '>'
      lcase := string(&lcase)
      ucase := string(&ucase)
   }

   s ? {
      ="<" | fail
      tab(many(wset))
      tab(many(idset)) | fail		# no name
      repeat {
         tab(upto(idset)) | fail
         kw := map(tab(many(idset)), lcase, ucase)
         tab(many(wset))
         if ="=" then {
            tab(many(wset))
            kw ||:= " "
            if ="\"" then {
               kw ||:= tab(upto('"') | 0)
               tab(any('"'))
               }
            else if ="'" then {
               kw ||:= tab(upto('\'') | 0)
               tab(any('\''))
               }
            else
               kw ||:= tab(upto(qset) | 0)
            }
         suspend kw
      }
   }
end



#  urlmerge(base,new) -- merge URLs

procedure urlmerge(base, new)		#: merge URLs
   local protocol, host, path
   static notslash
   initial notslash := ~'/'

   if new ? (tab(many(&letters)) & =":") then
      return new			# new is fully specified

   base ? {
      protocol := (tab(many(&letters)) || =":") | ""
      host := (="//" || tab(upto('/') | 0)) | ""
      path := tab(upto('#') | 0)
      }

   new ? {
      if ="#" then
         return protocol || host || path || new
      if ="/" then
         return protocol || host || new

      while (="." & (="/" | pos(0))) |
            (=".." & (="/" | pos(0)) & (path := url_trim(path)))

      return protocol || host || trim(path, notslash) || tab(0)
      }
end

#  url_trim(path) -- trim trailing dir provided that at least one "/" remains

procedure url_trim(path)
   static notslash
   initial notslash := ~'/'

   reverse(path) ? {			# work from back end
      tab(upto('/') + 1) | fail		# trim dir, fail if no "/"
      if =".." & (="/" | pos(0))
         then fail			# don't trim a ".." component
      path := reverse(tab(0))		# otherwise use the rest
   }
   if upto('/', path) then		# one / must remain to be valid
      return path
   else
      fail
end
This page produced by UniDoc on 2021/04/15 @ 23:59:44.