Fixtimes.pl
From DLXS Documentation
Main Page > Mounting Collections: Class-specific Steps > Mounting a Bib Class Collection > Preparing BibClass Data > fixtimes.pl
#!/l/local/bin/perl
#
# pfarber Thu Mar 23 14:23:07 2000
#
# Parse times data which conforms to the timesi DTD:
#
# <!DOCTYPE timesi [
# <!ELEMENT timesi - - (mh)+ >
# <!ELEMENT mh - - (mhead,rec+) +(hi) >
# <!ELEMENT mhead - - (#pcdata|g)+ >
# <!ELEMENT rec - - (k,(d190|d191|d192|d193|d194|d197|d198),e+) >
# <!ELEMENT (d190|d191|d192|d193|d194|d197|d198) - - (#pcdata) >
# <!ELEMENT k - - (#pcdata) >
# <!ELEMENT e - - (h,(h2)?,t,r) >
#
# <!ELEMENT (h|h2) - - (#pcdata|g)+ >
# <!ELEMENT t - - (#pcdata,g?,(c|q)?)* >
# <!ELEMENT g - - (#pcdata) >
# <!ELEMENT (c|q) - - (#pcdata) >
# <!ELEMENT r - - ((c|q)?,(d|f)+,p,#pcdata,v?) >
# <!ELEMENT (d|f) - - (#pcdata) >
# <!ELEMENT p - - (#pcdata) >
# <!ELEMENT v - - (#pcdata) >
#
# <!ELEMENT hi - - (#pcdata) > ]>
#
# emit the parsed data conforming to the bib DTD: /l1/lib/sgml/bib.dtd
#
# Input is read with <> (files named on the command line, else STDIN),
# one <mh>...</mh> element at a time.  For every <e> inside every <rec>
# exactly one <A> record is printed to STDOUT:
#
#   <A ID DT>             ID = "TIMES" . filename prefix . running number
#                         DT = today's date, YYYYMMDD
#     <B><K>              pcdata of <t> with <g>, <c>, <q> removed
#     <F><K>              "The Times" followed by the <c>/<q> content taken
#                         from <t>, or from <r> when <t> has none
#        <Z><YR>          content of the first <d> or <f> in <r>
#           <PG>          "p. " . <p> content, "col. " . pcdata of <r>,
#                         then a space and the <v> content
#     <H><P>              content of the <rec>'s <k>
#     <I2><KW><AF>        content of <h>, plus a second <AF> for <h2> if any

use strict;
use warnings;

my %charEntHash;

# DT attribute value: today's date as YYYYMMDD.  localtime() returns a
# 0-based month, so 1 is added; the fields are zero-padded so that the
# concatenation is unambiguous.
my ($mday, $mon, $year) = (localtime(time))[3, 4, 5];
my $date = sprintf("%04d%02d%02d", $year + 1900, $mon + 1, $mday);

# Grab the prefix of the filename arg to this program as a uniquifier
# for the ID across all files processed
my $idValPart = (defined $ARGV[0] && $ARGV[0] =~ m,(.+?)\..+,) ? $1 : '';
my $id = 0;

# each <> read returns the input up to and including the next </mh>
$/ = "</mh>";

InitCharEntHash();

while (defined(my $mh = <>)) {
    #
    #--- parse <!ELEMENT mh - - (mhead,rec+) +(hi) >
    # kill <mhead> </mhead> + content -- just skip
    #
    $mh =~ s,<mhead>.*?</mhead>,,s;

    # kill <g> and </g> tags
    $mh =~ s,</?g>,,g;

    #--- within a <rec>
    foreach my $recElement ($mh =~ m,(<rec>.*?</rec>),sg) {

        # rid <rec> of all \n to make parsing easier
        $recElement =~ s,\n,,g;

        # map char ents to corresponding character
        # This is now handled by mungeents.pl pfarber Thu Aug 17 12:04:29 2000
        # $recElement =~s,\&([A-Za-z0-9]+?);,$charEntHash{$1},g;

        # kill e.g. <d191></d191> + content
        $recElement =~ s,<d1..>.*?</d1..>,,;

        # map <hi> -> <I>, </hi> -> </I> globally
        $recElement =~ s,<hi>,<I>,g;
        $recElement =~ s,</hi>,</I>,g;

        # grab content of <k>
        my ($kContent) = ($recElement =~ m,<k>(.*?)</k>,);

        #--- emit one <A> record per <e>
        foreach my $eElement ($recElement =~ m,(<e>.*?</e>),g) {
            print FormatEntry($eElement, $kContent,
                              "TIMES" . $idValPart . $id, $date);
            $id += 1;
        }
    }
}

# Parse the <t> element of one <e> according to (#pcdata,g?,(c|q)?)*.
#
# Takes the text of an <e> element (without newlines).  Returns a
# two-element list:
#   - the pcdata of <t> with <g> tags and all <c>/<q> elements removed
#     ('' when there is no <t>),
#   - the content of the last <c> or <q> found inside <t>, or undef when
#     <t> contains neither.
#
# Every pass of the loop removes one element from the string, so the loop
# always terminates, even for a missing or malformed <t>.
sub ParseT {
    my ($eElement) = @_;

    my ($tText) = ($eElement =~ m,<t>(.*?)</t>,);
    return ('', undef) unless defined $tText;

    # kill <g>, </g> within <t>
    $tText =~ s,</?g>,,g;

    my $cqContent;
    while ($tText =~ s,<([cq])>(.*?)</\1>,,) {
        $cqContent = $2;
    }

    return ($tText, $cqContent);
}

# Parse the <r> element of one <e> according to
# ((c|q)?,(d|f)+,p,#pcdata,v?).
#
# Takes the text of an <e> element (without newlines).  Returns a hash
# reference with the keys
#   cq  => content of the first <c> or <q>
#   df  => content of the first <d> or <f> (no more than one is expected)
#   p   => content of <p>
#   col => pcdata following <p>, up to <v> or </r>
#   v   => content of <v>
# Every value is '' when the corresponding piece is absent.
sub ParseR {
    my ($eElement) = @_;

    # keep the closing </r>: it terminates the trailing pcdata below
    my ($rText) = ($eElement =~ m,<r>(.*?</r>),);
    $rText = '' unless defined $rText;

    # The element name is captured and back-referenced so that the content
    # is always in $2, whichever of the two alternatives matched.
    my $cq = ($rText =~ s,<([cq])>(.*?)</\1>,,) ? $2 : '';
    my $df = ($rText =~ s,<([df])>(.*?)</\1>,,) ? $2 : '';
    my $p  = ($rText =~ s,<p>(.*?)</p>,,)       ? $1 : '';

    # grab pcdata up until <v> or </r>
    my ($col) = ($rText =~ m,(.*?)<(?:/r|v)>,);
    my ($v)   = ($rText =~ m,<v>(.*?)</v>,);

    return {
        'cq'  => $cq,
        'df'  => $df,
        'p'   => $p,
        'col' => defined $col ? $col : '',
        'v'   => defined $v   ? $v   : '',
    };
}

# Build the bib <A> record for one <e> element.
#
#   $eElement  text of the <e>...</e> element, newlines removed
#   $kContent  content of the enclosing <rec>'s <k> (may be undef)
#   $idVal     value for the ID attribute
#   $date      value for the DT attribute
#
# Returns the record as a single string ending in "\n".
sub FormatEntry {
    my ($eElement, $kContent, $idVal, $date) = @_;

    # the content of <h>,<h2> within this <e>
    my ($hContent)  = ($eElement =~ m,<h>(.*?)</h>,);
    my ($h2Content) = ($eElement =~ m,<h2>(.*?)</h2>,);

    my ($tContent, $tCqContent) = ParseT($eElement);
    my $r = ParseR($eElement);

    # apparently either <t> has (c|q)? or <r> has (c|q)?, exclusively
    my $cqContent = (defined $tCqContent && length $tCqContent)
                  ? $tCqContent
                  : $r->{'cq'};

    $h2Content = (defined $h2Content && length $h2Content)
               ? "<AF>" . $h2Content . "</AF>"
               : "";

    # absent elements are emitted as empty strings
    $kContent = '' unless defined $kContent;
    $hContent = '' unless defined $hContent;

    # All variable data, including the ID, is passed as an argument so
    # that a '%' in it cannot be taken for a format directive.
    return sprintf(
        qq{<A ID="%s" DT="%s">}
      . qq{<B><K>%s</K></B>}
      . qq{<F><K>The Times%s</K><Z><YR>%s</YR><PG>%s, %s%s</PG></Z></F>}
      . qq{<H><P>%s</P></H>}
      . qq{<I2><KW A="Times"><AF>%s</AF>%s</KW></I2></A>\n},
        $idVal,              # ID attval
        $date,               # DT attval
        $tContent,           # K within B
        $cqContent,          # K within F
        $r->{'df'},          # YR
        "p. $r->{'p'}",      # PG
        "col. $r->{'col'}",  # PG following <p> content
        " $r->{'v'}",        # PG following <r> content
        $kContent,           # P
        $hContent,           # AF
        $h2Content           # another AF
    );
}

# Fill the file-level %charEntHash with the entity-name => replacement
# table.  Entities with no replacement map to themselves ("&name;").
# The table is not consulted by the main loop: the substitution that used
# it is commented out there.
sub InitCharEntHash {
    %charEntHash = (
        'AElig' => "Æ",
        'Aacgr' => "&Aacgr;",
        'Aacute' => "Á",
        'Abar' => "&Abar;",
        'Acirc' => "Â",
        'Agr' => "&Agr;",
        'Agrave' => "À",
        'Agvgr' => "&Agvgr;",
        'Atigr' => "&Atigr;",
        'Auml' => "Ä",
        'Bgr' => "&Bgr;",
        'Ccedil' => "Ç",
        'Dgr' => "&Dgr;",
        'EEacgr' => "&EEacgr;",
        'EEgr' => "&EEgr;",
        'EEgvgr' => "&EEgvgr;",
        'EEtigr' => "&EEtigr;",
        'Eacgr' => "&Eacgr;",
        'Eacute' => "É",
        'Ebreve' => "&Ebreve;",
        'Ecirc' => "Ê",
        'Egr' => "&Egr;",
        'Egrave' => "È",
        'Egvgr' => "&Egvgr;",
        'Etigr' => "&Etigr;",
        'Euml' => "Ë",
        'Ggr' => "&Ggr;",
        'Iacgr' => "&Iacgr;",
        'Icirc' => "Î",
        'Igr' => "&Igr;",
        'Igvgr' => "&Igvgr;",
        'Itigr' => "&Itigr;",
        'Iuml' => "Ï",
        'KHgr' => "&KHgr;",
        'Kgr' => "&Kgr;",
        'Lgr' => "&Lgr;",
        'Mgr' => "&Mgr;",
        'Ngr' => "&Ngr;",
        'OElig' => "Œ",
        'OHacgr' => "&OHacgr;",
        'OHgr' => "&OHgr;",
        'OHgvgr' => "&OHgvgr;",
        'OHtigr' => "&OHtigr;",
        'Oacgr' => "&Oacgr;",
        'Obar' => "&Obar;",
        'Ocirc' => "Ô",
        'Ogr' => "&Ogr;",
        'Ogvgr' => "&Ogvgr;",
        'Otigr' => "&Otigr;",
        'Ouml' => "Ö",
        'PHgr' => "&PHgr;",
        'PSgr' => "&PSgr;",
        'Pgr' => "&Pgr;",
        'Rgr' => "&Rgr;",
        'Sbreve' => "&Sbreve;",
        'Sgr' => "&Sgr;",
        'Slungr' => "&Slungr;",
        'THgr' => "&THgr;",
        'Tgr' => "&Tgr;",
        'Uacgr' => "&Uacgr;",
        'Ucirc' => "Û",
        'Ugr' => "&Ugr;",
        'Ugvgr' => "&Ugvgr;",
        'Utigr' => "&Utigr;",
        'Uuml' => "Ü",
        'Xgr' => "&Xgr;",
        'Zgr' => "&Zgr;",
        'aacdiagr' => "&aacdiagr;",
        'aacgr' => "&aacgr;",
        'aacute' => "á",
        'abar' => "&abar;",
        'acirc' => "â",
        'adiagr' => "&adiagr;",
        'adiagvgr' => "&adiagvgr;",
        'adiatigr' => "&adiatigr;",
        'aelig' => "æ",
        'agr' => "&agr;",
        'agrave' => "à",
        'agvgr' => "&agvgr;",
        'aposgr' => "&aposgr;",
        'aring' => "å",
        'ashort' => "&ashort;",
        'ast' => "*",
        'atigr' => "&atigr;",
        'atilde' => "ã",
        'auml' => "ä",
        'bgr' => "&bgr;",
        'cacute' => "ć",
        'cbreve' => "&cbreve;",
        'ccedil' => "ç",
        'colgr' => "&colgr;",
        'deg' => "°",
        'dgr' => "d",
        'dollar' => "\$",
        'eacgr' => "&eacgr;",
        'eacute' => "é",
        'ebar' => "&ebar;",
        'ebreve' => "&ebreve;",
        'ecirc' => "ê",
        'eeacgr' => "&eeacgr;",
        'eegr' => "&eegr;",
        'eegvgr' => "&eegvgr;",
        'eetigr' => "&eetigr;",
        'egr' => "&egr;",
        'egrave' => "è",
        'egvgr' => "&egvgr;",
        'equals' => "=",
        'etigr' => "&etigr;",
        'euml' => "ë",
        'frac12' => "1/2",
        'frac13' => "1/3",
        'frac14' => "1/4",
        'frac18' => "1/8",
        'frac34' => "3/4",
        'frac78' => "7/8",
        'fslash' => "/",
        'ggr' => "&ggr;",
        # was ">igr;" -- "&gt" had been decoded as an HTML entity
        'gtigr' => "&gtigr;",
        'hyphen' => "-",
        'hyphgr' => "&hyphgr;",
        'iacdiagr' => "&iacdiagr;",
        'iacgr' => "&iacgr;",
        'iacute' => "í",
        'ibar' => "&ibar;",
        'icirc' => "î",
        'idiagr' => "&idiagr;",
        'idiagvgr' => "&idiagvgr;",
        'idiatigr' => "&idiatigr;",
        'igr' => "&igr;",
        'igrave' => "ì",
        'igvgr' => "&igvgr;",
        'ishort' => "&ishort;",
        'isubgr' => "&isubgr;",
        'itigr' => "&itigr;",
        'iuml' => "ï",
        'kgr' => "&kgr;",
        'khgr' => "&khgr;",
        'ktigr' => "&ktigr;",
        'ldquo' => '"',
        'lgr' => "&lgr;",
        'lsquo' => "'",
        'mdash' => "--",
        'mgr' => "&mgr;",
        'minus' => "-",
        'naugr' => "&naugr;",
        'ndash' => "-",
        'ngr' => "&ngr;",
        'ntilde' => "ñ",
        'oacgr' => "&oacgr;",
        'oacute' => "ó",
        'obar' => "&obar;",
        'obreve' => "&obreve;",
        'ocirc' => "ô",
        'oelig' => "œ",
        'ogr' => "&ogr;",
        'ograve' => "ò",
        'ogvgr' => "&ogvgr;",
        'ohacgr' => "&ohacgr;",
        'ohgr' => "&ohgr;",
        'ohgvgr' => "&ohgvgr;",
        'ohmacrgr' => "&ohmacrgr;",
        'ohtigr' => "&ohtigr;",
        'oshort' => "&oshort;",
        'oslash' => "ø",
        'otigr' => "&otigr;",
        'otilde' => "õ",
        'ouml' => "ö",
        'percent' => "%",
        'pgr' => "&pgr;",
        'phgr' => "&phgr;",
        'pound' => "£",
        'psgr' => "&psgr;",
        'qugr' => "&qugr;",
        'rb' => "&rb;",
        'rbAacgr' => "&rbAacgr;",
        'rbAgr' => "&rbAgr;",
        'rbAgvgr' => "&rbAgvgr;",
        'rbAtigr' => "&rbAtigr;",
        'rbEEacgr' => "&rbEEacgr;",
        'rbEEgr' => "&rbEEgr;",
        'rbEEgvgr' => "&rbEEgvgr;",
        'rbEEtigr' => "&rbEEtigr;",
        'rbEacgr' => "&rbEacgr;",
        'rbEgr' => "&rbEgr;",
        'rbEgvgr' => "&rbEgvgr;",
        'rbEtigr' => "&rbEtigr;",
        'rbIacgr' => "&rbIacgr;",
        'rbIgr' => "&rbIgr;",
        'rbIgvgr' => "&rbIgvgr;",
        'rbItigr' => "&rbItigr;",
        'rbOHacgr' => "&rbOHacgr;",
        'rbOHgr' => "&rbOHgr;",
        'rbOHgvgr' => "&rbOHgvgr;",
        'rbOHtigr' => "&rbOHtigr;",
        'rbOacgr' => "&rbOacgr;",
        'rbOgr' => "&rbOgr;",
        'rbOgvgr' => "&rbOgvgr;",
        'rbOtigr' => "&rbOtigr;",
        'rbRgr' => "&rbRgr;",
        'rbUacgr' => "&rbUacgr;",
        'rbUgr' => "&rbUgr;",
        'rbUgvgr' => "&rbUgvgr;",
        'rbUtigr' => "&rbUtigr;",
        'rbaacgr' => "&rbaacgr;",
        'rbagr' => "&rbagr;",
        'rbagvgr' => "&rbagvgr;",
        'rbatigr' => "&rbatigr;",
        'rbeacgr' => "&rbeacgr;",
        'rbeeacgr' => "&rbeeacgr;",
        'rbeegr' => "&rbeegr;",
        'rbeegvgr' => "&rbeegvgr;",
        'rbeetigr' => "&rbeetigr;",
        'rbegr' => "&rbegr;",
        'rbegvgr' => "&rbegvgr;",
        'rbetigr' => "&rbetigr;",
        'rbiacgr' => "&rbiacgr;",
        'rbigr' => "&rbigr;",
        'rbigvgr' => "&rbigvgr;",
        'rbitigr' => "&rbitigr;",
        'rboacgr' => "&rboacgr;",
        'rbogr' => "&rbogr;",
        'rbogvgr' => "&rbogvgr;",
        'rbohacgr' => "&rbohacgr;",
        'rbohgr' => "&rbohgr;",
        'rbohgvgr' => "&rbohgvgr;",
        'rbohtigr' => "&rbohtigr;",
        'rbotigr' => "&rbotigr;",
        'rbreve' => "&rbreve;",
        'rbrgr' => "&rbrgr;",
        'rbuacgr' => "&rbuacgr;",
        'rbugr' => "&rbugr;",
        'rbugvgr' => "&rbugvgr;",
        'rbutigr' => "&rbutigr;",
        'rdquo' => '"',
        'rgr' => "&rgr;",
        'rsquo' => "'",
        'sb' => "&sb;",
        'sbAacgr' => "&sbAacgr;",
        'sbAgr' => "&sbAgr;",
        'sbAgvgr' => "&sbAgvgr;",
        'sbAtigr' => "&sbAtigr;",
        'sbEEacgr' => "&sbEEacgr;",
        'sbEEgr' => "&sbEEgr;",
        'sbEEgvgr' => "&sbEEgvgr;",
        'sbEEtigr' => "&sbEEtigr;",
        'sbEacgr' => "&sbEacgr;",
        'sbEgr' => "&sbEgr;",
        'sbEgvgr' => "&sbEgvgr;",
        'sbEtigr' => "&sbEtigr;",
        'sbIacgr' => "&sbIacgr;",
        'sbIdiagr' => "&sbIdiagr;",
        'sbIgr' => "&sbIgr;",
        'sbIgvgr' => "&sbIgvgr;",
        'sbItigr' => "&sbItigr;",
        'sbOHacgr' => "&sbOHacgr;",
        'sbOHgr' => "&sbOHgr;",
        'sbOHgvgr' => "&sbOHgvgr;",
        'sbOHtigr' => "&sbOHtigr;",
        'sbOacgr' => "&sbOacgr;",
        'sbOgr' => "&sbOgr;",
        'sbOgvgr' => "&sbOgvgr;",
        'sbOtigr' => "&sbOtigr;",
        'sbRgr' => "&sbRgr;",
        'sbUacgr' => "&sbUacgr;",
        'sbUgr' => "&sbUgr;",
        'sbUgvgr' => "&sbUgvgr;",
        'sbUtigr' => "&sbUtigr;",
        'sbaacgr' => "&sbaacgr;",
        'sbagr' => "&sbagr;",
        'sbagvgr' => "&sbagvgr;",
        'sbatigr' => "&sbatigr;",
        'sbeacgr' => "&sbeacgr;",
        'sbeeacgr' => "&sbeeacgr;",
        'sbeegr' => "&sbeegr;",
        'sbeegvgr' => "&sbeegvgr;",
        'sbeetigr' => "&sbeetigr;",
        'sbegr' => "&sbegr;",
        'sbegvgr' => "&sbegvgr;",
        'sbetigr' => "&sbetigr;",
        'sbiacgr' => "&sbiacgr;",
        'sbigr' => "&sbigr;",
        'sbigvgr' => "&sbigvgr;",
        'sbitigr' => "&sbitigr;",
        'sboacgr' => "&sboacgr;",
        'sbogr' => "&sbogr;",
        'sbogvgr' => "&sbogvgr;",
        'sbohacgr' => "&sbohacgr;",
        'sbohgr' => "&sbohgr;",
        'sbohgvgr' => "&sbohgvgr;",
        'sbohtigr' => "&sbohtigr;",
        'sbotigr' => "&sbotigr;",
        'sbreve' => "&sbreve;",
        'sbrgr' => "&sbrgr;",
        'sbuacgr' => "&sbuacgr;",
        'sbugr' => "&sbugr;",
        'sbugvgr' => "&sbugvgr;",
        'sbutigr' => "&sbutigr;",
        'sfgr' => "&sfgr;",
        'sgr' => "&sgr;",
        'slungr' => "&slungr;",
        'tblank' => " ",
        'tgr' => "&tgr;",
        'thgr' => "&thgr;",
        'uacdiagr' => "&uacdiagr;",
        'uacgr' => "&uacgr;",
        'uacute' => "ú",
        'ubar' => "&ubar;",
        'ubreve' => "ŭ",
        'ucirc' => "û",
        'udiagr' => "&udiagr;",
        'udiagvgr' => "&udiagvgr;",
        'udiatigr' => "&udiatigr;",
        'ugr' => "&ugr;",
        'ugrave' => "ù",
        'ugvgr' => "&ugvgr;",
        'ushort' => "&ushort;",
        'utigr' => "&utigr;",
        'uuml' => "ü",
        'wblank' => "&wblank;",
        'xgr' => "&xgr;",
        'yacute' => "ý",
        'ygrave' => "&ygrave;",
        'yuml' => "ÿ",
        'z' => " ",
        'zbreve' => "z",
        'zgr' => "&zgr;",
    )
}