# Source file senten1.icn
############################################################################
#
#	File:     senten1.icn
#
#	Subject:  Procedure to generate sentences
#
#	Author:   Peter A. Bigot
#
#	Date:     August 14, 1996
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
# sentence(f) generates the English sentences encountered in a file.
#
############################################################################
#
# The following rules describe what a 'sentence' is.
# 
# * A sentence begins with a capital letter.
# 
# * A sentence ends with one or more of '.!?', subject to other
#   constraints.
# 
# * If a period is immediately followed by:
#   - a digit
#   - a letter
#   - one of ',;:'
#   it is not a sentence end.
# 
# * If a period is followed (with intervening space) by a lower case
#   letter, it is not a sentence end (assume it's part of an abbreviation).
#
# * The sequence '...' does not end a sentence.  The sequence '....' does.
# 
# * If a sentence end character appears after more opening parens than
#   closing parens in a given sequence, it is not the end of that
#   particular sentence. (I.e., full sentences in a parenthetical remark
#   in an enclosing sentence are considered part of the enclosing
#   sentence.  Their grammaticality is in question, anyway.) (It also
#   helps with attributions and abbreviations that would fail outside
#   the parens.)
#
# * No attempt is made to ensure balancing of double-quoted (") material.
# 
# * When scanning for a sentence start, material which does not conform is
#   discarded.
# 
# * Corollary: Quotes or parentheses which enclose a sentence are not
#   considered part of it.
# 
# * An end-of-line on input is replaced by a space unless the line ends
#   with 'a-' (where 'a' is any letter), in which case the hyphen is
#   deleted and the next line is joined on directly.
#
# * Leading and trailing space (tab, space, newline) chars are removed
#   from each line of the input.
#
# * If a blank line is encountered on input while scanning a sentence,
#   the scan is aborted and search for a new sentence begins (rationale:
#   ignore section and chapter headers separated from text by newlines).
#
# * Most titles before names would fail the above constraints.  They are
#   special-cased.
#
# * This does NOT handle a person's middle initial.  To do so would rule
#   out sentences such as 'It was I.'  Six of one, half-dozen of the
#   other--I made my choice.
#
# * Note that ':' does not end a sentence.  This is a stylistic choice,
#   and can be modified by simply adding ':' to sentend below.
#
############################################################################

# sentence(infile) : s1, s2, ...
#
# Generates (suspends) each sentence found in the lines read from infile,
# one string per sentence, and fails once read(infile) fails.
#
#   infile   value passed to read(); the source of input lines.
#
# Input lines are joined into a working buffer (`line`).  The buffer is
# first advanced to the next upper-case letter (a possible sentence
# start), then scanned for runs of sentence-end characters.  Each run is
# checked against the surrounding context; the first run that passes
# every check ends the sentence.  A blank input line abandons the
# sentence in progress.  If input stops (EOF or blank line) right after a
# sentence-end run with nothing following it, the text so far is still
# produced as a sentence.
procedure sentence (infile)
   local
      line,                     # Working buffer: unconsumed input text
      sentence,                 # A possible sentence (prefix of line)
      lstend,                   # Position in line of last checked sentence end
      possentp,                 # Non-null if buffer ends right after a sentence
                                #  end with no following char to judge it by
      spaceskip,                # Spaces between EOSent and next char (context)
      nextch,                   # Next char after EOSent (null if none)
      cnt,                      # Net count of '(' minus ')' in possible sent.
      t,                        # Title currently being compared
      newline                   # Line most recently read while extending line
   static
      sentend,                  # Cset for sentence end chars
      wspace,                   # White space characters
      noperend,                 # Chars which, after period, don't end sentence
      titles                    # Titles that can appear before names.
   initial {
      sentend := '.?!'          # Initial value for sentend
      wspace := ' \t\n'         # Space chars
      noperend := &digits ++ &letters ++ ',:;' # No-end after period chars
      titles := ["Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Pres."]
      }

   line := ""
   # Repeat scanning for and suspending sentences until input fails.
   repeat {
      # Try to find the start of a sentence in the current input string.
      # If there are none, read more from file; fail if file exhausted.
      # Trim trailing space from line (leading skipped by sentence start)
      while not (line ?:= (tab (upto (&ucase)) & tab (0))) do {
         line := trim (read (infile), wspace) | fail
         }

      # Find the sentence end.  If there's no viable candidate, read more
      # from input.  Set the last end position to the first char in the
      # sentence.
      lstend := 1
      possentp := &null
      repeat {
         line ? {
            # Skip up to new stuff (scanned in previous lines).
            sentence := tab (lstend)
            while sentence ||:= tab (upto (sentend)) do {
               sentence ||:= tab (many (sentend))
               
               # Verify end-of-sentence.  Assume it doesn't pass.
               possentp := &null
               
               # Check for sentence end conformance.  See what follows it: put
               # that in nextch, and the intervening space before it in
               # spaceskip.
               # Note hack to scan in remainder of line w/o changing &pos:
               # `every` resumes tab (0) after the nested scan, and the
               # resumed tab restores the outer &pos.
               nextch := &null
               every tab (0) ? {
                  spaceskip := tab (many (wspace)) | ""
                  nextch := move (1)
                  }
                  
               if /nextch then {
                  # Don't have enough context to ensure a proper sentence end.
                  # Read more, but let readers know that this could be a
                  # sentence end (e.g., in case of EOF on input).
                  # lstend is deliberately NOT advanced, so this candidate is
                  # re-examined once more text has been appended.
                  possentp := 1
                  break
                  }
               
               # Save position of last checked sentence end, so we don't try to
               # recheck this one.
               lstend := &pos
               
               # .<noperend> doesn't end a sentence.
               if (sentence [-1] == '.' &
                   spaceskip == "" &
                   any (noperend, nextch)) then {
                  next
                  }
               
               # .<spc><lcase> doesn't end sentence
               if (sentence [-1] == '.' &
                   any (&lcase, nextch)) then {
                  next
                  }

               # ... doesn't end sentence. .... does.
               if (sentence [-3:0] == "..." &
                   sentence [-4] ~== ".") then {
                  next
                  }

               # Number of ')' must be >= number '(' in sentence.
               sentence ? {
                  cnt := 0
                  while tab (upto ('()')) do {
                     if ="(" then {
                        cnt +:= 1
                        }
                     else {
                        =")"
                        cnt -:= 1
                        }
                     }
                  }
               if (cnt > 0) then {
                  next
                  }

               # Special case titles that appear before names (otherwise look
               # like sentence ends).  The title must be a whole word: either
               # it starts the sentence (the subscript before it fails) or the
               # character before it is not a letter.  Without this, words
               # such as "ATMs." would be mistaken for the title "Ms.".
               every t := ! titles do {
                  if (t == sentence [- *t:0] &
                      not (any (&letters, sentence [-(*t + 1)]))) then {
                     # Break every, next iteration of sentence-end search loop
                     break next
                     }
                  }

               # This is a sentence.  Replace the line with what follows the
               # sentence, and break out of the sentence-end-search loop.
               line := tab (0)
               break break
               }
            }
         # There is no valid sentence end so far.  Remove a trailing hyphen
         # from the current line, or add a word-separating space.
         if line [-1] == '-' & any (&letters, line [-2]) then {
            line := line [1:-1]
            }
         else {
            line ||:= " "
            }

         # Read another line.  If can't, then fail--but suspend sentence first
         # if it _could_ be a sentence end.  Trim leading and trailing spaces
         # from the new line--if it's empty, toss the line so far and restart;
         # otherwise, tack it onto the end of the current line.
         if not (newline := read (infile)) then {
            if \possentp then {
               suspend (sentence)
               }
            fail
            }
         if any (wspace, newline) then {
            newline ?:= (tab (many (wspace)), tab (0))
            }
         newline := trim (newline, wspace)
         if (*newline = 0) then {
            if \possentp then {
               suspend (sentence)
               }
            line := ""
            # Break EOS check, next beginning-of-sent scan
            break next
            }
         line ||:= newline
         }

      # Suspend the sentence, then loop back for more.
      suspend sentence
      }
   end # procedure sentence

# This page produced by UniDoc on 2021/04/15 @ 23:59:44.