#!/usr/bin/perl -w
#---------------------------------#
# PROGRAM: scholia_formatter.pl #
#---------------------------------#
# Read the whole input file (named by the first command-line argument) into
# @contents, open the output file for writing, and reset the state flags used
# by the line-processing loop that follows.
die "Usage: $0 <input file>\n" unless defined $ARGV[0] && length $ARGV[0];

# Three-argument open: the file name can never be misread as an open mode.
open(INFILE, '<', $ARGV[0])
    or die "Error opening input file '$ARGV[0]': $!";
@contents = <INFILE>;    # slurp every line of the input

# Prefix only the file-name component with "new_", so that an input given with
# a directory part (dir/page.html) is written to dir/new_page.html instead of
# into a "new_dir/" directory. A bare file name still yields "new_<name>".
($outputFile = $ARGV[0]) =~ s{([^/\\]*)\z}{new_$1};
open(OUTFILE, '>', $outputFile)
    or die "Error opening output file '$outputFile': $!";

# Parser state; all flags start cleared and are updated by the loop.
$print_begin = 0;             # set to 1 once body output has started
$dashed_list_found = 0;       # set to 1 when a dashed list item is rewritten
$footnote = 0;                # 0 until the footnote section starts, then a counter
$inside_a_list = 0;           # 1 while inside a list within a footnote
$header_two_lines = 0;        # non-zero while a header's closing tag is still pending
$new_footnote_paragraph = 0;  # tracks paragraph handling inside a footnote
foreach $line (@contents) {
# break at end of content
if ($line =~ //) {
last;
}
# TITLE
if ($line =~ /class="art_head/) {
$line =~ s{}{};
$line =~ s{
}{};
print OUTFILE $line;
next;
}
# BODY
if ($line =~ //) { # start printing each line only once the print actually starts
$print_begin = 1;
next;
}
if ($print_begin == 1) {
# $line =~ s/^\s+//; # trim whitespace at beginning of line
# Headers # replaces headers contained in a single line
$line =~ s{(.+)
}{$2
};
$line =~ s{(.+)
}{$2
};
$line =~ s{(.+)
}{$2
};
$line =~ s{(.+)
}{$2
};
if ($header_two_lines ge 1 && $line =~ '
') { # replaces the closing header tag on a later line
$line =~ s###;
$header_two_lines = 0;
}
if ($line =~ 'p class="art_sub1') { # replaces headers spanning multiple lines
$line =~ s{}{
};
$header_two_lines = 2;
}
if ($line =~ 'p class="art_sub2') {
$line =~ s{
}{
};
$header_two_lines = 3;
}
if ($line =~ 'p class="art_sub3') {
$line =~ s{
}{
};
$header_two_lines = 4;
}
if ($line =~ 'p class="art_sub4') {
$line =~ s{
}{
};
$header_two_lines = 4;
}
# Content
$line =~ s{
}{
};
$line =~ s{
}{
};
$line =~ s{
}{
}; # center
$line =~ s{
}{
}; # right
$line =~ s{font face="Symbol"}{span class="greek"}g; # this line shouldn't be necessary, really...
$line =~ s{}{}; # Symbol font
$line =~ s{}{};
$line =~ s{
?(.+)
?
}{$1
}; ### I haven't seen this occur yet...
$line =~ s{centered" class="art_sub2">Thesis (..?)}{centered">Thesis $1}; # Thesis statements
$line =~ s{Thesis (..?)}{Thesis $1};
$line =~ s{centered" class="art_sub2">(.*)}{centered">$1};
#$1}g;
$line =~ s{^
$}{}; # dashify lists
# $line =~ s/\x{201C}/“/; # open double quote
# $line =~ s/\x{201D}/”/; # close double quote
# $line =~ s/\x{2018}/‘/; # open single quote
# $line =~ s/\x{2019}/’/; # close single quote
# $line =~ s/\x{2026}/…/; # ellipsis
# $line =~ s/\x{2013}/–/; # en dash
# $line =~ s/\x{2014}/—/; # em dash
# Lists
if ($line =~ '- (.+)
') {
$line =~ s{- (.+)
}{- $1
};
$dashed_list_found = 1;
}
#FOOTNOTES
# if ($line =~ 'Notes
' && $footnote == 0) { # some scholias don't have this line
if ($line =~ '$' && $footnote == 0) {
$footnote = 1;
}
if ($footnote ge 1) {
$line =~ s{^\s*$}{}; # remove
$line =~ s{^\s*
$}{}; # remove
if ($line =~ m{^\s*$}) { # remove , but remember the number
$footnote = $1;
next;
}
if ($line =~ m{^ ( )?- \d*}) {
$line =~ s{^ ( )?
- \d*\.? ?}{
};
$footnote++;
$inside_a_list = 0; # to account for any footnotes not closed with
$new_footnote_paragraph = 0; # to account for any multi-paragraphed footnotes with an odd number of
tags
}
if ($line =~ '- ') { # don't replace list item tags for lists inside footnotes
$inside_a_list = 1;
}
if ($inside_a_list == 0) {
$line =~ s{
}{};
}
if ($line =~ '') {
$inside_a_list = 0;
}
if ($line =~ '
' && $new_footnote_paragraph == 0) { # multiple paragraphs in a footnote are enclosed in tags
$line =~ s{
}{
}; # assumes multiple paragraphs are denoted with two
tags
$new_footnote_paragraph = 2;
}
if ($line =~ '
' && $new_footnote_paragraph == 2) {
$line =~ s{
}{