Source file gedcom.icn
############################################################################
#
#	File:     gedcom.icn
#
#	Subject:  Procedures for reading GEDCOM files
#
#	Author:   Gregg M. Townsend
#
#	Date:     June 23, 2000
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#	These procedures read and interpret GEDCOM files, a standard
#	format for genealogy databases.
#
############################################################################
#
#	gedload(f) loads GEDCOM data from file f and returns a gedcom
#	record containing the following fields:
#	    tree	root of tree of gednode records
#	    id		table of labeled nodes, indexed by @ID@
#	    fam		list of FAM nodes (marriages)
#	    ind		list of INDI nodes (individuals)
#
#	The tree is composed of gednode records R containing these fields:
#	    level	level
#	    id		ID (label), including @...@ delimiters
#	    tag		tag
#	    data	data
#	    lnum	line number
#	    parent	parent node in tree
#	    ref		referenced node, if any
#	    sub		sub-entry list
#	    hcode	unique hashcode, if INDI node
#
#	gedwalk(tree) generates the nodes of the tree in preorder.
#
#	Three procedures find descendants of a node based on a sequence
#	of identifying tag strings:
#	    gedsub(R, tag...) generates subnodes specified by tag sequence
#	    gedval(R, tag...) generates data values of those subnodes
#	    gedref(R, tag...) generates nodes referenced by those subnodes
#
#	Three procedures extract a person's name from an INDI record:
#	    gedfnf(R)	produces "John Quincy Adams" form
#	    gedlnf(R)	produces "Adams, John Quincy" form
#	    gednmf(R,f)	produces an arbitrary format, substituting
#			prefix, firstname, lastname, suffix for
#			"P", "F", "L", "S" (respectively) in f
#
#	geddate(R) finds the DATE subnode of a node and returns a string
#	of at least 12 characters in a standard form such as "11 Jul 1767"
#	or "abt 1810".  It is assumed that the input is in English.
#
#	gedfind(g,s) generates the individuals under gedcom record g
#	that are named by s, a string of whitespace-separated words.
#	gedfind() generates each INDI node for which every word of s
#	is matched by either a word of the individual's name or by
#	the birth year.  Matching is case-insensitive.
#
############################################################################

record gedcom(
   tree,	# tree of data records
   id,		# table of labeled nodes, indexed by @ID@
   fam,		# list of FAM nodes
   ind		# list of INDI nodes
)

record gednode(
   level,	# level
   id,		# ID (label), including @...@ delimiters
   tag,		# tag
   data,	# data
   lnum,	# line number
   parent,	# parent node in tree
   ref,		# referenced node, if any
   sub,		# sub-entry list
   hcode	# hashcode, if INDI node
)

$define WHITESPACE ' \t\n\r'



#  gedload(f) -- load GEDCOM data from file f, returning gedcom record.

procedure gedload(f)		#: load GEDCOM data from file f
   local line, lnum, r, curr
   local root, id, fam, ind
   local hset, h1, h2, c

   lnum := 0
   root := curr := gednode(-1, , "ROOT", "", lnum, , , [])
   id := table()
   fam := []
   ind := []

   while line := read(f) do {
      lnum +:= 1
      if *line = 0 then
         next

      if not (r := gedscan(line)) then {
         write(&errout, "ERR, line ", lnum, ": ", line)
         next
      }
      r.lnum := lnum
      r.sub := []

      if r.tag == "CONC" then {		# continuation line (no \n)
         curr.data ||:= r.data
         next
         }
      if r.tag == "CONT" then {		# continuation line (with \n)
         curr.data ||:= "\n" || r.data
         next
         }

      while curr.level >= r.level do
         curr := curr.parent
      put(curr.sub, r)
      r.parent := curr
      curr := r

      id[\r.id] := r
      case r.tag of {
         "FAM":  put(fam, r)
         "INDI":  put(ind, r)
      }
   }

   every r := gedwalk(root) do
      r.ref := id[r.data]

   hset := set()
   every r := !ind do {
      h1 := h2 := gedhi(r)
      every c := !"123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" do
         if member(hset, h2) then
            h2 := h1 || c	# add disambiguating suffix if needed
         else
            break
      insert(hset, r.hcode := h2)
      }

   return gedcom(root, id, fam, ind)
end



#  gedscan(f) -- scan one line of a GEDCOM record, returning gednode record

procedure gedscan(s)		# (internal procedure)
   local level, id, tag, data
   static alnum
   initial alnum := &letters ++ &digits ++ '_'

   s ? {
      tab(many(WHITESPACE))
      level := tab(many(&digits)) | fail
      tab(many(WHITESPACE))
      if id := (="@" || tab(upto('@') + 1)) then
         tab(many(WHITESPACE))
      tag := tab(many(alnum)) | fail
      tab(many(WHITESPACE))
      data := tab(0)
      return gednode(level, id, tag, data)
      }
end



#  gedwalk(r) -- walk GEDCOM tree, generating nodes in preorder

procedure gedwalk(r)		#: generate GEDCOM tree nodes in preorder
   suspend r | gedwalk(!r.sub)
   fail
end



#  gedsub(r, field...) -- generate subrecords with given tags
#  gedval(r, field...) -- generate values of subrecords with given tags
#  gedref(r, field...) -- generate nodes referenced by given tags

procedure gedsub(r, f[])	#: find subrecords
   local tag, x

   tag := get(f) | fail
   every x := !r.sub do {
      if x.tag == tag then
         if *f > 0 then
            suspend gedsub ! push(f, x)
         else
            suspend x
   }
end

procedure gedval(a[])		#: find subrecord values
   suspend (gedsub ! a).data
end

procedure gedref(a[])		#: find referenced nodes
   suspend \(gedsub ! a).ref
end



#  gedfnf(r) -- get name from individual record, first name first

procedure gedfnf(r)		#: get first name first
   return gednmf(r, "P F L S")
end



#  gedlnf(r) -- get name from individual record, last name first

procedure gedlnf(r)		#: get last name first
   local s
   s := gednmf(r, "L, P F S")
   s ? {
      =", "
      return tab(0)
      }
end



#  gednmf(r, f) -- general name formatter
#
#  substitutes the first name, last name, prefix, and suffix
#  for the letters F, L, P, S respectively in string f.
#  multiple spaces are suppressed.

procedure gednmf(r, f)		#: format name
   local c, s, prefix, first, last, suffix

   prefix := gedval(r, "TITL" | "NPFX") | gedval(r, "NAME", "NPFX")
   s := gedval(r, "NAME") | fail
   s ? {
      first := trim(tab(upto('/') | 0))
      ="/"
      last := trim(tab(upto('/') | 0))
      ="/"
      suffix := gedval(r, "NSFX") | ("" ~== tab(0))
   }
   s := ""
   f ? {
      while s ||:= tab(upto('PFLS ')) do {
         while c := tab(any('PFLS ')) do {
            s ||:= case c of {
               "P": \prefix
               "F": \first
               "L": \last
               "S": \suffix
               " ": s[-1] ~== " "
               }
            }
         }
      s ||:= tab(0)
      }
   return trim(s)
end



#  geddate(r) -- get date from record in standard form

procedure geddate(r)		#: get canonical date
   local s, t, w
   static ftab
   initial {
      ftab := table()
      ftab["JAN"] := "Jan";  ftab["FEB"] := "Feb"; ftab["MAR"] := "Mar"
      ftab["APR"] := "Apr";  ftab["MAY"] := "May"; ftab["JUN"] := "Jun"
      ftab["JUL"] := "Jul";  ftab["AUG"] := "Aug"; ftab["SEP"] := "Sep"
      ftab["OCT"] := "Oct";  ftab["NOV"] := "Nov"; ftab["DEC"] := "Dec"
      ftab["ABT"] := "abt";  ftab["BEF"] := "bef"; ftab["AFT"] := "aft"
      ftab["CAL"] := "cal";  ftab["EST"] := "est"
      }

   s := trim(gedval(r, "DATE"), WHITESPACE) | fail
   t := ""

   s ? while not pos(0) do {
      tab(many(WHITESPACE))
      w := tab(upto(WHITESPACE) | 0)
      t ||:= " " || (\ftab[w] | w)
   }

   if *t > 13 then
      return t[2:0]
   else
      return right(t, 12)
end



#  gedyear(r) -- get year from event record

procedure gedyear(r)		#: get year
   local d, y

   d := gedval(r, "DATE") | fail
   d ? while tab(upto(&digits)) do
      if (y := tab(many(&digits)) \ 1) >= 1000 then
         return y
end



#  gedhi -- generate hashcode for individual record
#
#  The hashcode uses two initials, final digits of birth year,
#  and a 3-letter hashing of the full name and birthdate fields.

procedure gedhi(r)		# (internal procedure)
   local s, name, bdate, bd
   static lc, uc
   initial {
      uc := string(&ucase)
      lc := string(&lcase)
      }

   s := ""
   name := gedval(r, "NAME") | ""
   name ? {
      # prefer initial of nickname; else skip unused firstname in parens
      tab(upto('"') + 1) | (="(" & tab(upto(')') + 1))
      tab(any(' \t'))
      s ||:= tab(any(&letters)) | "X"		# first initial
      tab(upto('/') + 1)
      tab(any(' \t'))
      s ||:= tab(any(&letters)) | "X"		# second initial
   }

   bdate := geddate(gedsub(r, "BIRT")) | ""
   bd := bdate[-2:0] | "00"
   if not (bd ? (tab(many(&digits)) & pos(0))) then
      bd := "99" 
   s ||:= bd || gedh3a(name || bdate)
   return map(s, lc, uc)
end



#  gedh3a(s) -- hash arbitrary string into three alphabetic characters

procedure gedh3a(s)		# (internal procedure)
   local n, d1, d2, d3, c

   n := 0
   every c := !map(s) do
      if not upto(' \t\f\r\n', c) then
         n := 37 * n + ord(c) - 32
   d1 := 97 + (n / 676) % 26
   d2 := 97 + (n / 26) % 26
   d3 := 97 + n % 26
   return char(d1) || char(d2) || char(d3)
end



#  gedfind(g, s) -- find records by name from gedcom record
#
#  g is a gedcom record; s is a string of whitespace-separated words.
#  gedfind() generates each INDI node for which every word of s
#  is matched by either a word of the individual's name or by
#  the birth year.  Matching is case-insensitive.

procedure gedfind(g, s)		#: find individual by name
   local r
   
   every r := !g.ind do 
      if gedmatch(r, s) then
         suspend r
end


#  gedmatch(r, s) -- match record against name
#
#  s is a string of words to match name field and/or birth year.
#  Matching is case sensitive.

procedure gedmatch(r, s)	# (internal procedure)
   local w

   every w := gedlcw(s) do
      (w == (gedlcw(gedval(r, "NAME")) | gedyear(gedsub(r, "BIRT")))) | fail
   return r
end



#  gedlcw(s, c) -- generate words from string s separated by chars from c
#
#  words are mapped to lower-case to allow case-insensitive comparisons

procedure gedlcw(s, c)		# (internal procedure)
   /c := '/ \t\r\n\v\f'
   map(s) ? {
      tab(many(c))
      while not pos(0) do {
         suspend tab(upto(c) | 0) \ 1
         tab(many(c))
         }
      }
   fail
end

This page produced by UniDoc on 2021/04/15 @ 23:59:45.