Source file idxGen.icn
################################################################################
#
#   idxGen   Take a file of index terms and put them in the appropriate places
#            in a set of LaTeX files. The original files are not altered:
#            either new files are produced, or output is made to &output
#
#   Don Ward   March 2017
#           10 March 2020  Do all index hits in a para, not just the first.
#                          Float index references to the top of iconline{}.
#
#-------------------------------------------------------------------------------
# The big picture is that the program analyses the files one paragraph at a time. For
# each paragraph, it makes a table of all the words together with where they occur
# (line number and ordinal position). It then places index hits before the line
# where they occur. An index term, in its full glory, can be something like
#
#      (V|v)irtual( |-)machine(|s)+3+performance:Virtual Machine!Performance
#
# which will place an "\index{Virtual Machine!Performance}" command before the
# first line in which any of the words
#      Virtual-machine, virtual-machine, Virtual-machines, virtual-machines
# or the two word phrases
#     Virtual machine, virtual machine, Virtual machines, virtual machines
# occur three words or fewer before the word "performance". In each case the
# index command will be placed on the line before the first word of the match.
#
# Words may be separated by white space, including line breaks and LaTeX comments:
# the program treats "-" as part of a word, so "virtual-machine" is one word and
# "virtual machine" is a two-word phrase.  Punctuation counts as a word when
# calculating the distance between words.
#
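#  A typical invocation (hypothetical file names) might be
#
#      idxGen -v -float -Dir indexed -idxFile idxterms.xml ch1.tex ch2.tex appendixA.tex
#
#  which writes indexed copies of ch1.tex, ch2.tex and appendixA.tex (with the
#  appropriate \index commands inserted) into the directory "indexed".
#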
################################################################################
link options
link ximage
import xml
import threads           # Files are analysed in parallel if it looks worthwhile.

global idx               # map from search string -> index term
global idx_ex            # map from extended search string -> (index term, search list)
global perFileidx        # map from filename -> (map from search string -> index command)
global perFileidx_ex     # map from filename -> ( map from extended search string -> (index term, search list))
global suppressed        # map from filename to set of terms (which are suppressed for that file)

global whitespace
global writeNewFile      # write a new file (to &output if idxDir is null)
global idxDir            # output Directory

global logging           # enable progress/debug information on &errout
global hits              # total number of index insertions

global stats             # index statistics
global idxTerms          # index terms that were placed in the index
global idxCandidates     # All possible index terms
global noComplaints      # Suppress "not found" for these index terms
global maxPlus           # The highest number of leading "+" chars seen in a search term
global floatIdxHits      # Float index hits to outside iconline etc.
#--------------------------------------------------------------------------------

# A paragraph record holds the remaining text of the whole file, together with
# a list of lines of the current paragraph and a map from each word to a list of
# occurrences in the paragraph. An occurrence is (line no, ordinal position)
# inProgress is a set of index terms that are being defined by
# \PrimaryIndexBegin{term} ... \PrimaryIndexEnd{term}. Normal index hits for
# term are suppressed in between the PrimaryIndexBegin ... End lines.
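# For example, a source file might (hypothetically) contain
#     \PrimaryIndexBegin{Virtual Machine}
#       ... the text that defines the term; automatic \index{Virtual Machine}
#       hits are suppressed on these lines ...
#     \PrimaryIndexEnd{Virtual Machine}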
#
record paragraph (sourceText, lines, words, inProgress)
record occurrence (line, wpos)

# A multiple word search record contains the index term to be used plus the
# search (which is a list of words, interspersed with the distance between them).
# so "Bill+Ben+3+Flowerpot" will result in searchList being
#     ["Bill", 1, "Ben", 3, "Flowerpot"]
record multiIndex (indexTerm, searchList)

#--------------------------------------------------------------------------------
procedure main(args)
   local totalms := Timer()
   local opt := options(args,
                        "-sequential -Dir: -float! -idxOnly! -idxFile: -v -V -wait+ -? -help",
                        usage)
   local n, k, perFileTerms
   local p, totalTerms
   local timeoutSecs

   &dump := 1
   #-------------------- Initialisation & option processing
   whitespace := ' \t'
   hits := 0
   maxPlus := 0
   stats := mutex(table(0))
   idxTerms := mutex(set())
   idxCandidates := set()

   logging := \opt["v"] | \opt["V"]
   if \opt["help"] | \opt["?"] then usage()

   \opt["idxOnly"] | writeNewFile := 1
   idxDir := \opt["Dir"]

   floatIdxHits := opt["float"]

   #-------------------- Read in the index terms
   totalTerms := getIndexTerms(\opt["idxFile"] | "idxterms.xml")
   if /totalTerms | (totalTerms = 0) | (*idxCandidates = 0)
   then stop("Stopping because no index terms have been defined")

   #-------------------- Index the files
   # If writing several new files, do them in parallel, unless explicitly asked
   # to do them sequentially, we want console output, we're not running on a
   # multi-processor, or there are too few files to bother.
   if \opt["sequential"] | /writeNewFile | cores() = 1 | *args < 5 then
      every postIndex(!args)
   else {
      MakePool()
      every Dispatch(postIndex, !args, &main)

      # If no guidance (-wait n) has been given, guess that each file takes no
      # more than a second (elapsed) and we do them in parallel as much as
      # possible. Add a couple of seconds for luck.

      timeoutSecs := \opt["wait"] |  (2 + ((*args) / cores()))
      Log("Allowing ", timeoutSecs, " seconds for indexing")
      if not waitFor(*args,  timeoutSecs) then
         write(&errout, "Timed out!")
   }

   #-------------------- Post-run statistics (optional)
   if \logging then {# print some post-run statistics
      Log(stats["files"], " files processed, ",
          stats["hits"], " index entries inserted")

      n := *idxCandidates - *idxTerms - *noComplaints

      if n > 0 then {
         Log(n, " index terms were not placed (-V for list)")

         # -V (as opposed to -v) generates details of non matching terms
         if \opt["V"] then {
            every Log(!(idxCandidates -- idxTerms -- noComplaints))
         }
      }

      Log("Total CPU time ", Timer(totalms), " mS")
   }
   &dump := 0
   return  # success
end

#--------------------------------------------------------------------------------
#   Return the number of cores reported by &features
procedure cores()
   static nCores
   initial {
      nCores := 1
      if (&features ? {="concurrent threads"}) then {
         nCores := integer(&features ? {="CPU cores " & tab(0) })
      }
   }

   return nCores
end

#--------------------------------------------------------------------------------
#   Wait for n messages or a timeout of t seconds, whichever happens first.
#   Fail if fewer than n messages are received before the timeout.

procedure waitFor(nMess, secs)
   local endTime, now
   local n := 0

   # It might be better if we had a millisecond based timeout, but there is
   # nothing available with this resolution. &time isn't really suitable: it
   # measures CPU time in milliseconds (but when multithreading, it increases by
   # more than 1 every millisecond, depending on how many cores are active).
   # So, for now (sic), we'll stick with seconds and &now.
   endTime := secs + (now := &now)
   if \endTime then {
      while n < nMess do {
         (1000 * (endTime - now)) <<@
            if nMess < (n +:= 1) then return # we got the required number of messages
         if 0 >= (endTime - (now := &now)) then return &fail  # timeout
      }
   } else { # No timeout: just wait
      while n < nMess do { <<@ ; n +:= 1 }
   }
   return #success
end

#--------------------------------------------------------------------------------
procedure Log(p[])
   if \logging then {push(p, &errout); write ! p}
end

#--------------------------------------------------------------------------------
procedure Timer(t)
   return if /t then &time else &time - t
end

#--------------------------------------------------------------------------------
# analyse the XML index configuration file
procedure getIndexTerms(filename: string)
   local ms := /logging | Timer()
   local line, searchTerm, indexTerm, multiTerm, searchList, searchLine
   local prs, d, root
   local x_terms, x_idx, x_ApplyTo, x_NotFound
   local files, specialFiles, totalTerms

   totalTerms := 0
   idx := table()
   idx_ex := table()
   noComplaints := set()
   suppressed := table()        # suppressed is a table of sets
   perFileidx := table()        # perFileidx is a table of tables
   perFileidx_ex := table()     # perFileidx_ex is a table of tables

   #----------------------------------------------------------------------
   # For reference: this is the structure of the configuration xml file; it's
   # basically a sequence of Terms that can have File and idx elements inside them.
   # <!DOCTYPE LaTeX_Index
   # [
   #   <!ELEMENT LaTeX_Index (Terms*)>
   #   <!ELEMENT Terms (File*,idx*)>
   #   <!ATTLIST Terms ApplyTo (All|NamedFiles|AllBut|None) "All"  NotFound (Error|Suppress) "Error">
   #   <!ELEMENT idx (#PCDATA)>
   #   <!ELEMENT File (#PCDATA)>
   # ]>
   #----------------------------------------------------------------------
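   # For illustration, a minimal configuration following that DTD might look like
   # this (the file and term names are hypothetical):
   #
   #   <LaTeX_Index>
   #     <Terms ApplyTo="AllBut" NotFound="Suppress">
   #       <File>preface.tex</File>
   #       <idx>(V|v)irtual( |-)machine(|s):Virtual Machine</idx>
   #       <idx>opcode(|s):Opcode</idx>
   #     </Terms>
   #   </LaTeX_Index>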
   prs := XmlParser()
   if d := prs.parse(entireFile(filename)) then {
      root := d.get_root_element()
      every x_terms := root.search_children("Terms") do {
         x_ApplyTo := x_terms.get_attribute("ApplyTo")
         x_NotFound := x_terms.get_attribute("NotFound")
         if x_ApplyTo ~== "None" then {
            # Slurp any File terms and warn about possible problems
            files := [:x_terms.search_children("File").get_string_content():] | []
            if *files > 0 then {
               if x_ApplyTo == "All" then {
                  Warn("<File> has no effect when ApplyTo = \"All\"")
               }
            } else if (x_ApplyTo == ("NamedFiles" | "AllBut")) & (*files = 0) then {
               Warn("<File> should be defined when ApplyTo = \"",x_ApplyTo, "\"")
            }

            # Add each idx item to the appropriate search structure based on the
            # ApplyTo and NotFound attributes of the enclosing Term.
            every x_idx := x_terms.search_children("idx") do {
               searchLine := x_idx.get_string_content()

               # Generate every alternative search term
               every line := Alt(searchLine) do {
                  if find("???", line) then {
                     Warn("Problem analysing \"", searchLine, "\",\n",
                          "    returned value is \"", line, "\"")
                     break next # skip every alternative: go to next idx term
                     }
                  multiTerm := searchTerm := indexTerm := &null
                  if extendedSearch(line) then { # multiple word search
                     line ? {
                        tab(many(whitespace))
                        maxPlus <:= (many("+") - 1)
                        if multiTerm := tab(upto(':')) then {
                           move(1); indexTerm := tab(0)
                           searchList := multiSearch(deSpace(multiTerm))
                        } else {
                           multiTerm := tab(0)
                           indexTerm := multiTerm
                           searchList := multiSearch(deSpace(multiTerm))
                        }
                     }
                  } else { # single word search
                     line ? {
                        tab(many(whitespace))
                        maxPlus <:= (many("+") - 1)
                        if = "%" then next  # Skip commented lines

                        # An index item is either searchTerm:indexTerm
                        # or searchTerm (which means indexTerm = searchTerm)
                        if searchTerm := tab(upto(":")) then {
                           move(1); indexTerm := tab(0)
                        } else {
                           searchTerm := indexTerm := line
                        }
                     }
                  }

                  # At this point, indexTerm is not null and either searchTerm or multiTerm
                  # (but not both) is defined

                  if /indexTerm | ( /searchTerm & /multiTerm) then {
                     write(&errout, "Failure (1) when analysing \"", line, "\"")
                     next
                     }

                  if \searchTerm & \multiTerm then {
                     write(&errout, "Failure (2) when analysing \"", line, "\"")
                     next
                     }

                  totalTerms +:= 1
                  insert(idxCandidates, indexTerm)
                  # apply the new search term to the appropriate structures
                  if x_NotFound == "Suppress" then insert(noComplaints, indexTerm)
                  if \searchTerm & \idx[searchTerm] then {
                     Warn(searchTerm, " is already defined")
                  }
                  if \multiTerm & \idx_ex[multiTerm] then {
                     Warn(multiTerm, " is already defined")
                  }

                  case x_ApplyTo of {
                     "All": { # Apply search to every file
                        idx[\searchTerm] := indexTerm
                        idx_ex[\multiTerm] := multiIndex(indexTerm, searchList)
                     }
                     "AllBut": { # Apply search to every file except the specified files
                        idx[\searchTerm] := indexTerm
                        idx_ex[\multiTerm] := multiIndex(indexTerm, searchList)
                        every /suppressed[!files] := set() # create a set, if required, for each file
                        every insert(suppressed[!files], \searchTerm)
                        every insert(suppressed[!files], \multiTerm)
                     }
                     "NamedFiles": { # Apply search only to the named files
                        every /perFileidx[!files] := table() # create a table, if required, for each file
                        every /perFileidx_ex[!files] := table() # create a table, if required, for each file
                        every perFileidx[!files][\searchTerm]:=indexTerm
                        every perFileidx_ex[!files][\multiTerm]:=multiIndex(indexTerm, searchList)
                     }
                  }
               }
            }
         }
      }
   } else {
      write(&errout, "Parsing ", filename, " failed: ", prs.get_reason())
   }

   # List any files that have special treatment
   if \logging & (*perFileidx + *perFileidx_ex + *suppressed) > 0 then {
      specialFiles := set()
      every insert(specialFiles, key(suppressed) | key(perFileidx) | key(perFileidx_ex))

      Log(*specialFiles, " Files treated specially")
      every Log("   ", !specialFiles)
   }

   Log(*idxCandidates, " index terms (", totalTerms, " search terms) acquired in ", Timer(\ms), " ms")
   return totalTerms
end

#--------------------------------------------------------------------------------
# return s stripped of leading "+" chars
procedure nonplus(s)
   s ? { tab(many("+")); return tab(0)}
end

#--------------------------------------------------------------------------------
# Replace each run of whitespace in s with a single plus sign
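# e.g. (using the term from the header comment)
#    deSpace("virtual machine+3+performance")  produces  "virtual+machine+3+performance"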
procedure deSpace(s)
   local ans := ""
   static notSpace
   initial {
      notSpace := &ascii -- ' \t'
   }

   s ? {
      tab(many(' \t')) # remove leading whitespace
      while ans ||:= tab(many(notSpace)) do {
         if tab(many(' \t')) then ans ||:= "+"
      }
   }
   return ans
end

#--------------------------------------------------------------------------------
# Generate alternatives from s
#   A(B|b)cd            ->   ABcd , Abcd
#   word(|s)            ->   word, words
#   (E|e)at(en||ing)    ->   Eaten, Eat, Eating, eaten, eat, eating
#   science( |-)fiction ->   science fiction, science-fiction
# etc.
#   A malformed line results in "???"
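# For example,
#    every write(Alt("(V|v)irtual( |-)machine(|s)"))
# writes, in some order, the eight alternatives listed in the header comment:
#    Virtual machine, Virtual machines, Virtual-machine, Virtual-machines,
#    virtual machine, virtual machines, virtual-machine, virtual-machines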
procedure Alt(s)
   local prefix, alt, suffix
   if /s then return "?"
   s ? {
      if = "(" then {
         alt := tab(upto(")")) | "???" ; move(1); suffix := tab(0) | "???"
         if find("|",alt) then {
            every suspend Alt(alt) || Alt(suffix)
         } else { # no alternatives, re-enclose alt in brackets and treat as a prefix
            every suspend "(" || alt || ")" || Alt(suffix)
         }
      } else if (prefix := tab(upto("("))) then {
         suffix := tab(0) | "???"
         every suspend prefix || Alt(suffix)
      } else  if find("|") then {
         repeat {
            alt := tab(upto('|)')) | "???"
            if = "|" then {
               /alt := ""
               suspend alt
            } else {
               return tab(0)
            }
         }
      } else {
         return s
      }
   }
end

#--------------------------------------------------------------------------------
# Succeed if s is an extended search command, i.e. (apart from leading plus signs)
# it has a plus sign before the end of the string or before a colon char, or a
# space (which will get turned into a plus sign).
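# For example (the last two terms are hypothetical):
#    extendedSearch("Bill+Ben+3+Flowerpot")   succeeds (a multiple word search)
#    extendedSearch("virtual-machine:VM")     fails    (a single word search)
#    extendedSearch("++preprocessor")         fails    (only leading plus signs)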
procedure extendedSearch(s)
   local colon
   s ? {
      tab(many(whitespace))
      if = "%" then return fail  # a comment is not an extended search
      tab(many("+"))    # strip leading plus signs
      colon := find(":") | *s
      return upto('+ \t') < colon
   }
   #fail
end

#--------------------------------------------------------------------------------
# analyse "aaa+bbb+ccc ..."    or  "a+4+bbb..." and produce a list
#  ["aaa",1,"bbb",1,"ccc" ...] or  ["a", 4, "bbb" ...]
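# e.g. multiSearch("virtual+machine+3+performance")
#      returns ["virtual", 1, "machine", 3, "performance"]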
procedure multiSearch(s)
   local sl := list()
   local word, number, line

   s ? {
      tab(many(' +')) # do not put leading spaces or plus chars into the search list
      while &pos <= *s do {
         if put(sl, word := tab(upto('+'))) then {
            if ="+" then {
               number := tab(many(&digits)) | ""
               if number == "" then { # No number supplied
                  put(sl, 1)
               } else {
                  put(sl, integer(number))
                  # eat trailing "+"
                  if not tab(match("+")) then
                     Warn("\"",s,"\" is missing a trailing plus sign")
               }
            } else {
               write(&errout, "failed to analyse \"", line, "\"")
               return &fail
            }
         } else { # write the last word
            put(sl, tab(0))
            return sl
         }
      }
   }

   #fail
end

#--------------------------------------------------------------------------------
# Gobble text up to the next paragraph break; split it into lines and words.
# For each word, store the line of occurrence and its ordinal position.
# The consumed text is removed from the source string.
# The data is returned to the caller inside the paragraph structure.
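# For example, if a paragraph (hypothetically) consists of the two lines
#      The quick brown fox
#      jumps over the lazy fox
# then p.words["fox"] is [occurrence(1,4), occurrence(2,9)]: word positions are
# counted from the start of the paragraph, not from the start of each line.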
procedure nextParagraph(p: paragraph)
   local line, word, lineNumber, wordNumber, notWord, squiggly, float
   local diags
   local floatstack
   static lineSpace, notEOL, texWord, otherChars
   initial {
      lineSpace := cset("\n \t")
      notEOL := cset(&cset -- "\n")
      texWord := &letters ++ &digits ++ cset("'\\_-&")
      otherChars := &ascii -- texWord
   }
   if *p.sourceText = 0 then {
      return fail     # end of text
   }

   floatstack := []
   diags := &null
   squiggly := &null
   # Split the text up into lines up to the next paragraph boundary
   # Note that none of this causes new string space to be created:
   # We essentially have a list of pointers into the original mega-string.
   p.sourceText ? {
      while line := tab(many(notEOL)) do {
         put(p.lines, line)
         move(1)
      }
      tab(many(lineSpace)) # Eat line terminator and any other whitespace between paragraphs
      p.sourceText := p.sourceText[&pos:0]    # adjust source text to start of next para.
   }

   # Store each word, with its occurrences in the paragraph, in p.words.
   # Treat inter-word stuff that is not whitespace as a word (for the
   # purpose of calculating ordinal word positions within the paragraph).
   lineNumber := wordNumber := 0
   every line := !(p.lines) do {
      lineNumber +:= 1

      # Should really take this out, but it might come in handy to debug matching terms.
      # note that %dump can never occur normally as a word.
      if line == "%-%-%idxGen:dump" then {
         p.words["%dump"] := []                 # signal that we want debugging output
         put(p.words["%dump"], occurrence(0,0)) # for this paragraph.
         diags := 1
      }

      if filtered(line) then next # Note that filtering works across paragraphs

      line ? {
         tab(many(whitespace))
         if ="\\PrimaryIndexBegin{" then { # suppress normal indexing for this term
            insert(p.inProgress, tab(upto("}")))
            next
            } else if ="\\PrimaryIndexEnd{" then { # re-enable normal indexing
               delete(p.inProgress, tab(upto("}")))
               next
               } else if ="\\index" | ="\\protect\\index" | ="\\PrimaryIndex" then {
                  next    # ignore any indexing line
               }

         while notWord := (if /squiggly
                           then tab(upto(texWord))
                           else tab(many(otherChars)) | tab(upto(texWord)))
         do {
            if \squiggly then {
               # count { and } because we are inside \iconline { .. } or \textit {}
               notWord ? {
                  while &pos <= *notWord do {
                     if not tab(match("\\{" | "\\}")) then {
                        if ="{" then { squiggly+:=1 }
                        if ="}" then {
                           if 0 = (squiggly -:= 1) then {
                              # insert an artificial "end of iconline/textit etc."
                              wordNumber +:= 1
                              word := "%end-" || float
                              /p.words[word] := []
                              if \diags then {
                                 write("Inserting ", word ," at ", wordNumber)
                              }
                              put(p.words[word], occurrence(lineNumber, wordNumber))
                              if *floatstack > 0 then {
                                 float := pop(floatstack)
                                 squiggly := pop(floatstack)
                              } else {
                                 squiggly := &null; break
                              }
                           }
                        } else {
                           move(1)
                        }
                     }
                  }
               }
            }

            notWord := cset(notWord)
            if (*notWord > 0) & (*(notWord -- whitespace) > 0) then wordNumber +:= 1
            if member(notWord, "%") then break  # rest of line is a comment

            word := tab(many(texWord)) | ""
            # check for an edge case: if the last character is \ and the next is a %
            # it isn't actually a comment, it's a quoted percent char
            if word[-1] == "\\" & match("%") then word ||:= move(1)
            # another edge case: if the last two characters are single quotes, it's
            # a LaTeX close quote, not part of a word
            if word[-1] == "'" & word[-2] == "'" then {
               word := word[1:-2]
            }
            # yet another: if the last two characters are "\\" it's a line break
            # not part of the word
            if word[-1] == "\\" & word[-2] == "\\" then {
               word := word[1:-2]
            }

            if (word ~== "") then {
               if \floatIdxHits then {
                  if word == ("\\iconline" | "\\textit") then { # start counting { and }
                     if \squiggly then { # nested search
                        push(floatstack, squiggly)
                        push(floatstack,float)
                     }
                     float := word
                     squiggly := if many(otherChars) then 0 else 1
                     if \diags then {
                        write("starting ",
                              (if *floatstack > 0 then "nested " else ""),
                              "search for ", word, " count = ", squiggly)
                     }
                  }
               }
               wordNumber +:= 1
               /(p.words[word]) := []
               put(p.words[word], occurrence(lineNumber, wordNumber))
            }
         }
      }
   }

   return # success
end

#--------------------------------------------------------------------------------
# index a file by looking for index terms and placing \index commands beforehand
procedure postIndex(filename: string, waiter)
   # waiter (if non null) is the id of the thread that is waiting for everyone
   # to finish, if the index is being produced in parallel.

   local basename := noPath(filename)
   local outf, line, lineno := 0
   local indexTerm, n, word, plusWord
   local filehits := 0
   local fileTerms := set()
   local para
   local opList, occurs, search
   local w1,w2,w3,w4,w5
   local o1,o2,o3,o4,o5
   local diff1,diff2,diff3,diff4
   local sL
   local diags := &null
   local float, endfloat, sflt, eflt # start/end of \iconline \textit etc.

   critical stats: {stats["files"] +:= 1 }
   if \perFileidx[basename] then Log("Using specific index terms for ", filename)

   # choose where the output goes
   if \writeNewFile then {
      if \idxDir then {
         outf := open(idxDir || "/" || basename, "c")
         if /outf then stop("Cannot open ", idxDir, "/", basename)
      } else outf := &output
   } else {
      # We are writing index terms only: Announce the file
      outf := &output
      write(outf, "Index terms for ", filename)
   }

   para := paragraph(entireFile(filename), list(), table(), set())

   # analyse the file one paragraph at a time, placing index commands
   # before each matching line
   while nextParagraph(para) do {
      if \para.words["%dump"] then { # Output debugging info for this paragraph
         delete(para.words, "%dump") # note that %dump never occurs normally.
         Log(ximage(para.words))
         diags := 1
      }

      opList := table() # A map from line number to set of index terms
      if \writeNewFile then {
         # Look for terms applicable to all (ApplyTo="All")
         # or most (ApplyTo="AllBut") files
         # or file-specific terms

         # For each word in the paragraph, if it is an index hit insert the lines
         # where it occurs (plus the indexing term) into the output list.
         every plusWord:= word := key(para.words) do {
            every 0 to maxPlus do {# look for "word", "+word", "++word" ...
               indexTerm := (\idx[plusWord]) |
                             (\perFileidx[basename] & \perFileidx[basename][plusWord]) |
                             &null
               if \indexTerm & not member(\suppressed[basename], plusWord) then {
                  every occurs := !(para.words[word]) do {
                     /(opList[occurs.line]) := set()
                     insert(opList[occurs.line], indexTerm)
                     filehits +:= 1
                  }
               }
               plusWord := "+" || plusWord #prepend a plus and try again
            }
         }

         # For each search in the extended search tables, see if
         #   a) All the needed words are in the paragraph.
         #   b) They are within the required distances of each other.
         every search := key(idx_ex) | key(\perFileidx_ex[basename]) do {
            if member(\suppressed[basename], search) then next

            # Before checking the distances between the words, make sure that
            # all the needed words are actually in the paragraph.
            sL := (\idx_ex[search]).searchList |
                  (\perFileidx_ex[basename][search]).searchList
            every n := 1 to *sL by 2 do {
               if /(para.words[sL[n]]) then break next    # No luck: try next search term
            }

            indexTerm := (\idx_ex[search]).indexTerm | (\perFileidx_ex[basename][search]).indexTerm
            if /indexTerm then stop("error: null index term")
            if type(indexTerm) ~== "string" then {
               write(&errout, ximage(indexTerm))
               stop("IndexTerm not a string")
            }

            case *sL of {
               3: { # two words:   word1 distance word2
                  w1 := sL[1]; w2 := sL[3]
                  diff1 := sL[2]

                  # write(&errout, "Looking for ", w1, " and ", w2, " within ", diff1, " words")
                  # See if any of the positions in the paragraph of word 2 is within the
                  # required distance of any of the positions of word 1.
                  every 0 < ( ((o2 <- !(para.words[w2])).wpos) - ((o1 <- !(para.words[w1])).wpos) ) <= diff1 do {
                     /(opList[o1.line]) := set()
                     insert(opList[o1.line], indexTerm)
                     if o2.line > o1.line then {
                        /(opList[o2.line]) := set()
                        insert(opList[o2.line], indexTerm)
                     }
                  }
               }

               5: { # three words: w1 d1 w2 d2 w3
                  w1 := sL[1]; w2 := sL[3]; w3 := sL[5]
                  diff1 := sL[2]; diff2 := sL[4]
                  # write(&errout, "Looking for ", w1, " and ", w2, " within ", diff1, " words and")
                  # write(&errout, "Looking for ", w2, " and ", w3, " within ", diff2, " words")
                  every (0 < ( ((o2 <- !(para.words[w2])).wpos) - ((o1 <- !(para.words[w1])).wpos) ) <= diff1) &
                        (0 < ( ((o3 <- !(para.words[w3])).wpos) - (o2.wpos) ) <= diff2) do {
                        /(opList[o1.line]) := set()
                        insert(opList[o1.line], indexTerm)
                        if o3.line > o1.line then {
                           /(opList[o3.line]) := set()
                           insert(opList[o3.line], indexTerm)
                        }
                     }
               }

               7: { # four words
                  w1 := sL[1]; w2 := sL[3]; w3 := sL[5]; w4 := sL[7]
                  diff1 := sL[2]; diff2 := sL[4]; diff3 := sL[6]
                  # write(&errout, "Looking for ", w1, " and ", w2, " within ", diff1, " words and")
                  # write(&errout, "            ", w2, " and ", w3, " within ", diff2, " words and")
                  # write(&errout, "            ", w3, " and ", w4, " within ", diff3, " words")
                  every (0 < ( ((o2 <- !(para.words[w2])).wpos) - ((o1 <- !(para.words[w1])).wpos) ) <= diff1) &
                        (0 < ( ((o3 <- !(para.words[w3])).wpos) - (o2.wpos) ) <= diff2) &
                        (0 < ( ((o4 <- !(para.words[w4])).wpos) - (o3.wpos) ) <= diff3) do {
                        /(opList[o1.line]) := set()
                        insert(opList[o1.line], indexTerm)
                        if o4.line > o1.line then {
                           /(opList[o4.line]) := set()
                           insert(opList[o4.line], indexTerm)
                        }
                     }
               }

               9: { # five words
                  w1 := sL[1]; w2 := sL[3]; w3 := sL[5]; w4 := sL[7]; w5 := sL[9]
                  diff1 := sL[2]; diff2 := sL[4]; diff3 := sL[6]; diff4 := sL[8]

                  every (0 < ( ((o2 <- !(para.words[w2])).wpos) - ((o1 <- !(para.words[w1])).wpos) ) <= diff1) &
                        (0 < ( ((o3 <- !(para.words[w3])).wpos) - (o2.wpos) ) <= diff2) &
                        (0 < ( ((o4 <- !(para.words[w4])).wpos) - (o3.wpos) ) <= diff3) &
                        (0 < ( ((o5 <- !(para.words[w5])).wpos) - (o4.wpos) ) <= diff4) do {
                        /(opList[o1.line]) := set()
                        insert(opList[o1.line], indexTerm)
                        if o5.line > o1.line then {
                           /(opList[o5.line]) := set()
                           insert(opList[o5.line], indexTerm)
                        }
                     }
               }

               2|4|6|8: {
                  stop("Program error: search term = ", search)
               }

               # We'll have to modify the program if someone comes up with a pressing
               # need for a six-word (or longer) index expression.
               default: {
                  stop("Search expression \"", search, "\" has too many words")
               }
            }

         }
      }

      if \diags then {
         write("para.lines ", ximage(para.lines))
         write("opList ", ximage(opList))
      }

      if \floatIdxHits & *opList > 0 then {
         # Float any index hits inside an iconline or textit declaration
         # to just before it.
         every float := !["\\iconline", "\\textit"] do {
            if float == key(para.words) then {
               endfloat := "%end-" || float
               if *\para.words[float] = *\para.words[endfloat] then {
                  if \diags then write("Floating ... ", float)

                  every n := 1 to *para.words[float] do {
                     sflt := para.words[float][n].line
                     eflt := para.words[endfloat][n].line
                     every lineno := (sflt + 1) to eflt do {
                        /opList[sflt] := set()
                        opList[sflt] ++:= \opList[lineno]
                        \opList[lineno] := set()
                     }
                  }
               } else {
                  write(&errout, ximage(para))
                  stop("iconline analysis error") # option - could just carry on?
               }
               if \diags then {
                  write("opList After float ", float, " ", ximage(opList))
               }
            }
         }
      }

      # write the paragraph, one line at a time with any index terms that match
      # before the line containing the first matching word
      lineno := 0
      every line := !para.lines do {
         lineno +:= 1
         every indexTerm := !\opList[lineno] do {
            filehits +:= 1
            insert(fileTerms,indexTerm)
            if not member(para.inProgress, indexTerm) then {
               write(outf, "\\protect\\index{", indexTerm, "}%")
            }
         }
         if \writeNewFile then write(outf, line);
      }
      write(outf) # paragraph separator

      # Clear the line and word info, ready for the next paragraph
      para.lines := list()
      para.words := table()
      diags := &null
   } # End of per-paragraph processing

   if *para.inProgress > 0 then {
      Warn(filename, ": These primary index terms should not still be active:")
      every Warn(filename, ": ", !(para.inProgress))
   }

   if 0 < filehits then {
      critical stats: {
         stats["hits"] +:= filehits
      }
      # insert the terms that resulted in an index command (to display
      # the terms that resulted in no hits at the end of the run)
      critical idxTerms: {
         every insert(idxTerms, !fileTerms)
         #           idxTerms ++:= fileTerms
      }
   }

   @> \waiter   # If someone is waiting, tell them a file has been finished
end

#--------------------------------------------------------------------------------
# Succeed if the line should not be analysed
procedure filtered(line :string)
   static begstr, endstr, nesting
   initial {
      nesting := table(&null)       # Use a table indexed by thread-id so we are thread-safe.
      begstr := set("\\begin{noIndex}", "\\begin{iconcode}", "\\begin{figure}",
                    "\\begin{picture}", "\\begin{tikzpicture}")
      endstr := set("\\end{noIndex}", "\\end{iconcode}", "\\end{figure}",
                    "\\end{picture}", "\\end{tikzpicture}")
   }

   if \nesting[&current] then { # Ignore everything until after the correct number of matching lines
      line ? {
         tab(many(whitespace))
         if match(!begstr) then nesting[&current] +:=1
         else if match(!endstr) then {
            if 0 = (nesting[&current] -:= 1) then {nesting[&current] := &null}
         }
      }
      return #success
   }

   # Not currently ignoring things
   line ? {
      tab(many(whitespace))
      if match("%") then return # success
      if match(!begstr) then {(\nesting[&current] +:= 1) | (nesting[&current] := 1)}
      if \nesting[&current] then return  # success
   }

   fail
end

#--------------------------------------------------------------------------------
procedure noPath(path)
   local n
   path ? {
      every n := upto('/\\:') # last file separator (if any)
      tab (integer(n) + 1)
      return tab(0)
   }
end

#--------------------------------------------------------------------------------
#   Return a string containing the entire contents of a file
procedure entireFile(name)
   local f := open(name) | stop("Cannot open ", string(name))
   local fds := stat(f) | stop("Cannot stat ", string(name))
   local s := reads(f, fds.size + 50)
   close(f)
   return s
end

#--------------------------------------------------------------------------------
procedure Warn(m[])
   push (m, "Warning: ", &errout)
   return write ! m
end

#--------------------------------------------------------------------------------
procedure usage(s)
   local cmd := "idxGen [-v] [-Dir d] [-idxFile f] [-idxOnly] [-sequential] [-float] [-wait n] [-?] [-help] files..."

   if \s then {
      write(&errout, s, " ... type \"idxGen -?\" for full instructions. Briefly")
      write(&errout, "  ", cmd)
   } else {
      every write(&errout,
      ![
         cmd,
         "",
         "Read in a file of index terms and annotate the files with",
         " \\index{ ...} commands before the lines that contain the terms.",
         "",
         "   -Dir d      Place the indexed files in directory d.",
         "   -idxFile f  f is an xml file containing the indexing terms, one term per line.",
         "               The default file (idxTerms.xml) contains a detailed explanation.",
         "",
         "   -idxOnly    Just produce the index terms annotated with their location.",
         "",
         "   -sequential Process the files sequentially, one after another, in a single thread.",
         "               (Five files (or more) will usually be processed in parallel, if possible)",
         "",
         "   -float      Move index hits inside an \\iconline{ } or \\textit{ } declaration to",
         "               just before it.",
         "",
         "   -v          Verbose: produce progress info and details of the run.",
         "",
         "   -wait n     Wait n seconds for the parallel indexing to finish before timing out",
         "",
         "   -? -help    Produce this output",
         ""
     ])
   }
   exit()
end
