#!/usr/bin/perl

#
# $Id: jcdiag,v 1.39 2013/07/20 15:04:45 gdg Exp $
#
#
# Copyright (c) 2013, Glenn D. Golden
#

#
# NAME
#   jcdiag - journalctl diagnostic tool
#
# SYNOPSIS
#   jcdiag [-h] [-v] [-m] [-s] [-ib=bsignore] [-il=lnignore] logfile
#
# DESCRIPTION
#   jcdiag is a utility for analyzing logfiles produced by journalctl, with
#   the specific aim of identifying boot-time messages that may have been
#   dropped by the journaling daemon, journald. This is presently the topic 
#   of a bug report filed by the author:
#
#      https://bugs.freedesktop.org/show_bug.cgi?id=66664
#
#   The underlying issue is discussed in more detail there.
#
#   jcdiag expects as input a logfile produced by journalctl using the default
#   "short" style output mode (or explicitly via "--output=short").  For the
#   results to be statistically useful, the logfile should preferably contain
#   multiple boot sequences, all of which were obtained under system conditions
#   which are as similar as feasible. (Ideally: Boot, log in as root and
#   immediately shut the system down for reboot.)
#
#   Given such input, jcdiag then makes three passes thru the logfile:
#
#     Pass 1:
#       Determine the approximate boundaries of each boot sequence. For
#	jcdiag's purposes, the bootseq boundaries are defined by the messages
#
#          "systemd XXX running in..."  # Message T0 (XXX = systemd version)
#          "Startup finished in..."     # Message T1
#
#       These definitions are intended to roughly cover the period of time
#	when systemd PID1 itself is issuing boot-related messages.
#
#     Pass 2:
#       Examine each bootseq identified in Pass 1 and filter out rogues of
#	various sorts, including bootseqs that seem too short, too peculiar,
#	or were nested or otherwise improperly terminated. Also ignored are
#	any bootseqs specified in 'bsignore' from the commandline; see below.
#
#	Then, among the remaining bootseqs, identify all messages of interest
#	("MOIs"). Presently, MOIs are defined to comprise all "Starting... "
#	and "Started..." messages issued by systemd itself, e.g.
#
#		Starting Virtual console mouse server
#		Started Network Time Service
#
#	The purpose of this pass is to form a superset list of every MOI
#	encountered during any of the analyzed boot sequences.
#
#     Pass 3:
#        Examine each bootseq accepted by Pass 2, and for each one, note which
#        MOIs from the superset list did and did not occur within the bootseq.
#
#   Finally, using the information from Passes 2 and 3, a detailed report is
#   generated, listing suspect dropped message and an estimate of the drop
#   rate for each boot sequence.  With the above explanation in hand, the
#   contents of the report should be reasonably self-explanatory. See also
#   the DISCUSSION section below.
#
#   The report filename is the orignal logfile name suffixed with ".rpt",
#   unless the logfile already had a suffix, in which case it that suffix is
#   replaced with "rpt".
#   
#   The -v option barfs detailed information to stderr about the progress
#   through each pass.
#
#   The -m option reports non-monotonic timestamps. This is really of little
#   value, since timestamps in the "short" output format are expected to
#   occasionally be out of order anyway.
#
#   The -il and -ib options allow ranges of lines and bootseqs to be ignored
#   for the purpose of the analysis. For example "-il=25-90,115-400" ignores
#   lines 25-90, and 115-400 (inclusive). "-ib="1,2,9-15" ignores bootseqs
#   1, 2, and 9-15 inclusive. Line numbers and bootseq numbers are 1-origin.
#
#   The -s option reads and reports values of various system parameters
#   which may affect the probability of journald dropping messags. Presently,
#   only the value of /proc/sys/net/unix/max_dgram_qlen is reported. NOTE:
#   These parameters are read from /proc/sys of the machine on which jcdiag
#   is executing, so obviously the information is meaningful only if that 
#   execution environment is identical to that seen by the journald instance
#   that collected the data in the logfile.
#
# DISCUSSION
#   Under the assumptions mentioned above for the logfile content, any MOI in
#   the superset list which does not also appear in the set of MOIs extracted 
#   from a particular bootseq implies that that message was probably dropped 
#   by journald during that bootseq.  This is not an airtight implication.
#   Nevertheless, under the assumptions, it is a reasonable smoking gun, and
#   the drop rates estimated based on it should not be too grossly in error. 
#
#   The rate estimates will be wrong in at least the following situations:
#
#     * If the system state is not identical across boot cycles, there may
#	be "outlier" boot sequences which generate many more MOIs than the 
#	average. This will result in a higher total superset MOI count, thus
#	biasing the dropped message rate estimate upward.
#
#     * Due to realtime loading and buffering issues, it is possible that MOIs
#	which were actually generated between T0 and T1 do not appear in the
#	logfile between the T0 and T1 messages, and therefore are not seen at
#	all by jcdiag.  This will tend to bias the dropped message rate 
#	estimate downward.
#
#     * Opposite of above: It is also possible that MOIs which were actually
#	generated prior to T0 appear in the logfile after the T0 message,
#	and thus will be processed by jcdiag.  For example, some systems invoke
#	systemd PID1 both during the initrd phase, and then again after the
#	real root filesystem is mounted. This causes two T0 messages to be
#	generated.  Presently, jcdiag pays attention only to the second one,
#	thus considering as MOIs only those messages which appear in the
#	logfile after the second systemd announcement. But if any MOIs
#	generated during the initrd invocation wind up in the logfile after
#	the second T0 message, those will count toward the superset MOI total,
#	and thus bias the dropped message rate estimate upward.
#
#   Detailed knowledge of the specific boot procedures in combination with the
#   the jcdiag report (which lists the suspected dropped messages individually)
#   should provide enough information to understand whether a particular listed
#   message was actually dropped or simply mis-counted by jcdiag for one of the
#   above reasons.
#
# AUTHOR
#   G. D. Golden, 2013.07
#

use Getopt::Long;
use English;
use File::Basename;
use FileHandle;
use warnings;


initialize();		# Initialize 
parse_cmdline();	# Parse commandline
barf_info();		# Report some high-level info
slurp_logfile();	# Suck in the logfile
top_level_parse();	# Locate bootseq boundaries.
mid_level_parse();	# Mid-level parse: More details on each bootseq.
sanity_checks();	# Sanity checks/warnings on current state of affairs
bootseq_rpt();		# Detailed report on each bootseq.
session_rpt();		# Session summary report.
cleanup();		# Clean up and go away.


#############################################################################

#
# Top-level parse (Pass 1): Identify all bootseq events in the logfile
# (even those which may later be ignored for analysis). Fill in @bs with
# 0-origin line index at which each bootseq begins ($re_bssta message) and
# ends ($re_bsend message or EOF).
#
sub top_level_parse
{
    my	$lidx = 0;		# Line index, 0-origin
    my	$lnum = 1;		# Line number, for humans, 1-origin.
    my	$ts;			# Timestamp from current line
    my	$prevts = '';		# Most recent timestamp prior to current line
    my  $t;			# Scratch.
    my	$in_bs = 0;		# State var for this loop:
				#   0:  We're not inside a bootseq event yet.
				#   1:  Inside a real bootseq event (i.e., as
				#       initiated by a "bootseq start" message)
				#   2:  Not used [yet] in jcdiag

    @bs = ();			# List of "bootseq start" lines (0-origin)
				# (Note global var.)

    rpt("Pass 1: Locate bootseq boundaries\n");
    if ($::ILSPEC ne '')
    {
	rpt("  lines $::ILSPEC ignored per cmdline\n");
	rpt("\n");
    }

    #
    # Loop over each line in the logfile.
    #
    foreach (@log)
    {
	#
	# Skip lines mentioned in the cmdline "line ignore" list.
	#
	if (inrange($lnum, @::lnignlist)) { goto skipline; }

	#
	# Skip lines starting with '#', under the assumption that such
	# lines have been manually commented out.
	#
	if (/^#/) { goto skipline; }

	#
	# Look for what appears to be a timestamp, without being too picky.
	# Then verify its legality and monotonicity in detail. In strict mode,
	# die on any timestamp corruption.  In non-strict mode, just whine
	# about the corruption, and skip this logfile line. Doesn't make sense
	# to try to continue process the line if the timestamp is fucqued.
	#
	$ts = '';		# Assume we won't find a valid timestamp.
 	if (/(^[A-Z][a-z][a-z] \d\d \d\d:\d\d:\d\d)/)  
	{
	    $ts = $1;
	    # rpt("line $lnum: ts = '$ts'\n");
	    if (validate_ts($ts) != 0)	# If timestamp corrupted...
	    {
		if ($STRICT > 0)
		{
		    die "? Corrupted timestamp, line $lnum\n";
		}
		rpt("* Corrupted timestamp, line $lnum; line ignored.\n");
		goto skipline;
	    }

	    #
	    # If we have a previous time stamp, verify present one is later.
	    # NOTE: Perl string comparison is used for the date comparison,
	    # because delsec (which uses Date::Manip) is UFS. Seems like this
	    # should be safe, since the timestamp subfields are ordered
	    # appropriately for string comparison, but no guarantees.
	    #
	    if ($prevts and ($ts lt $prevts))
	    {
		$t = sprintf("Timestamp non-monotonic, line %5d: [%s, %s]\n",
		     $lnum, $prevts, $ts);
		if ($STRICT > 1) { die "? $t"; }
		if ($RPT_NONMON) { rpt("* $t"); }
	    }
	    $prevts = $ts;
	}

	#
	# Also skip the line if we couldn't even find a timestamp.  Every type
	# of line we're interested in oughta have one. If we can't find it
	# (or if it's so corrupted we can't recognize it) flux it.
	# 
	if ($ts eq '') { goto skipline; }

	#
	# Boot sequence start message (identified by RE $::re_bssta).
	#
	if (m/$::re_bssta/)
	{
	    # rpt("%6d: $_",$lnum);

	    #
	    # If we were already processing a bootseq event when we saw this
	    # "bootseq start" message, set the present bootseq (the one we're
	    # already processing) termination code to 'nested', for later
	    # handling and ignoration. This situation is indicative of a
	    # malformed logfile in which the present bootseq was either not
	    # terminated as expected or the termination message was lost.
	    #
	    if ($in_bs == 1)	
	    {
		if ($::RPT_PASSINF)
		{
		    rpt("  line %6d: bssta nested, terminating this " .
			"bootseq\n", $lnum);
		    rpt("\n");
		}

		$t = $lidx-1;
		push(@bs, "$t nested"); 	# Termcode for outer bootseq
		$in_bs = 0;			# Not in a bootseq anymore
		redo;				# Let's reprocess it
	    }

	    #
	    # Some systems exec systemd both during the initrd phase of bootup
	    # and then again later after real / is mounted.  For our purposes,
	    # we wish to ignore the initrd instances of systemd and wait for
	    # the "real" one after / mount. Fortunately, it seems that at least
	    # some versions of systemd (205+?) announce when they are running
	    # in initrd mode by issuing a message "Running in initial RAM disk"
	    # just after starting. So look for this message over the next 10
	    # lines (10 = arbitrary value) and if we see it, ignore this bssta
	    # message.
	    #
	    if (grep(/Running in initial RAM disk/, @log[$lidx+1..$lidx+10]))
	    {
		if ($::RPT_PASSINF)
		{
		    rpt("  line %6d: bssta during initrd phase, ignored\n",
			$lnum);
		}
		goto skipline;
	    }

	    if ($::RPT_PASSINF)
	    {
		rpt("  line %6d: bssta after real root mount, " .
		    "accepted as T0\n", $lnum);
	    }

	    #
	    # In either case (already inside a bootseq or not) push the
	    # line number and "bssta" initcode onto @bs, and indicate that
	    # we are now inside a bootseq.
	    #
	    push(@bs, "$lidx bssta");
	    $in_bs = 1;		# "1" means we're inside a real bootseq now.
	}

	#
	# Boot sequence end message.
	#
	elsif (m/$::re_bsend/)
	{
	    # rpt("%6d: $_",$lnum);
	    $::RPT_PASSINF && rpt("  line %6d: ", $lnum);
	   
	    if ($in_bs == 0)		# Not in a bootseq, consider it spurious
	    {
		if ($::RPT_PASSINF)
		{
		    rpt("spurious bsend, ignored\n\n");
		}
	    }
	    elsif ($in_bs == 1)		# If we're inside a bootseq already...
	    {
		if ($::RPT_PASSINF)
		{
		    rpt("bsend in bootseq as anticipated, accepted as T1\n\n");
		}
		push(@bs, "$lidx bsend");# ...show ending line # and norm exit
	    }
	    elsif ($#bs == -1)		# If no start message seen yet, ok...
	    {
		if ($::RPT_PASSINF)
		{
		    rpt("* Ignoring early bsend at line $lnum\n");
		}
	    }
	    $in_bs = 0;			# We're not inside a bootseq anymore.
	}

	#
	# Bottom of loop processing: Flow from above or skip here for logfile
	# lines we can't (or choose not to) process.
	#
	skipline:
	{
	    ++$lidx;
	    ++$lnum;
	}
    }

    #
    # If $in_bs is still set, then the last bootseq event in the logfile never
    # terminated before the log was closed. This bootseq event is ignored:
    # Report it, then pop off the last 'bootseq start' message, so we don't get
    # confused by it when doing the analysis later.  
    #
    if ($in_bs)
    {
	rpt("* Last bootseq continues past end of logfile, " .
	    "bootseq ignored.\n");
	pop(@bs);			# Pop it off, forget about it.
    }

    #
    # Total number of bootseqs, $nbs_tot, is half the number of entries in @bs,
    # since @bs (should) contain one starting message and one terminating
    # message per bootseq. So verify $nbs_tot is even, die if not.
    #
    $nbs_tot = ($#bs+1)/2;
    if (int($nbs_tot) != $nbs_tot)	# Better be an even # of start+end recs
    {
	die "? AUUUGH! Hi-level parse inconsistency, aborting\n";
    }
    if ($nbs_tot == 0) { die("? No bootseq events found\n"); }

    rpt("Pass 1: Complete. $nbs_tot total boot sequences identified\n");
    rpt("\n");
}


#
# Mid-level parse (Passes 2 and 3):
#
#   Pass 2:
#     Loop over all bootseqs identified in Pass 1, and filter out only those 
#     which should participate in the subsequent analysis.  Bootseqs eliminated
#     in this filtering process are those intentionally ignored per the "-ib"
#     cmdline option, and those for which some boot-time error or other
#     peculiarity was noticed during parsing.
#
#     Then, for each of the filtered bootseqs, extract all encountered MOIs
#     into a global hash %::moihash. Thus, when Pass 2 is complete, %::moihash
#     comprises the superset of all MOIs over all analyzed bootseqs: The keys
#     are the MOI message texts and the values are the counts, which are all
#     set to 0. (The counts are used in Pass 3, see below.)
#
#   Pass 3:
#     Loop over all the bootseqs selected in Pass 2, and perform a diagnostic
#     analysis of each one.  Presently that analysis comprises making a per-
#     bootseq copy of %::moihash and then incrementing the count of every MOI
#     encountered within the bootseq. Thus, when complete, any MOI which has
#     a zero count in its private %::moihash copy is a candidate for having
#     been dropped, since a zero count implies that the message had been
#     encountered in some other bootseq, but not in this one.
#
sub mid_level_parse
{
    use	strict;

    my	$debug = 0;				# 1=dump %::moihash
    my	($i, $firstline, $lastline);

    #
    # Pass 2: Filter bootseqs and create %::moihash.
    #
    rpt("Pass 2: Filter bootseqs and form superset list of MOIs from those.\n");
    for ($i = 0; $i < $::nbs_tot; ++$i)		# Loop over bootseqs
    {
	$firstline = $::bs[2*$i];		# First line of $i-th bootseq
	$lastline =  $::bs[2*$i + 1];		# Last  line of $i-th bootseq
	$::bsrec[$i] = {};			# Tabula rasa
	extract_msgs($firstline, $lastline, $i);
    }

    #
    # Number of analyzed (i.e. filtered) bootseqs.
    #
    $::nbs_ana = $::nbs_tot -
		    ($::nbs_ign+$::nbs_pec+$::nbs_sht+$::nbs_err+$::nbs_nst);

    #
    # Compute/report total number of messages of interest in the superset
    # message hash. 
    #
    $::moi_tot = keys(%::moihash);
    rpt("Pass 2: Complete. %d/%d bootseqs accepted for analysis, " .
	 "%d MOIs found\n", $::nbs_ana, $::nbs_tot, $::moi_tot);
    rpt("\n");

    #
    # Debug: Dump some info about %::moihash
    #
    if ($debug)
    {
	my $t = %::moihash;			# Used/allocated bucket info
	print "moihash: $t entries\n";
	foreach (sort keys(%::moihash)) { print "$_\n"; } 
    }

    #
    # Pass 3: Analyze each bootseq selected in Pass 2, looking for candidate
    # absent MOIs
    #
    rpt("Pass 3: Identify MOIs not present in each analyzed bootseq\n");
    for ($i = 0; $i < $::nbs_tot; ++$i)		# Loop over bootseqs
    {
	$firstline = $::bs[2*$i];		# First line of $i-th bootseq
	$lastline =  $::bs[2*$i + 1];		# Last  line of $i-th bootseq
	analyze_bootseq($firstline, $lastline, $i);
    }

    rpt("Pass 3: Complete. ");
    rpt("%d bootseqs analyzed\n", $::nbs_ana);
    rpt("\n");
}

#
# Examine bootseq passed by caller, and determine whether it should be selected
# for analysis or not. If selected, extract all its MOIs into %::moihash.
#
sub extract_msgs
{
    use strict;

    my	($l0, $l1, $bsidx) = @_;# Args: (startln, endln, idx into @bsrec)
    my	($idx0, $idx1);		# Beginning/ending line indices for this bootseq
    my	$t;			# Scratch
    my	($ln);			# Current line number, 0-origin
    my	($msg);			# Extracted msg text
    my	$rp = \$::bsrec[$bsidx];# Shorthand ptr to our bsrec[] entry

    #
    # Copy %::moihash to our bsrec, so it's guaranteed to be present even if
    # this bootseq winds up being ignored (below). 
    #
    $$rp->{mh} = %::moihash;

    #  
    # Logfile line numbers and init/exit codes for this bootseq: $l0 contains
    # the starting line number and "init code", which tells how the bootseq
    # event was initiated.  $l1 contains the ending line number followed by 
    # an "exit code", which tells how the bootseq event was terminated.
    #
    ($$rp->{l0}, $$rp->{initcode}) = split(' ', $l0);
    ($$rp->{l1}, $$rp->{termcode}) = split(' ', $l1);

    #  
    # Convenience vars for the logfile line indices delimiting this bootseq.
    #
    $idx0 = $$rp->{l0};
    $idx1 = $$rp->{l1};

    #
    # From the bootseq start message, extract the start wallclock timestamp
    # into $$rp->{t0}
    #
    $::log[$idx0] =~ m/$::re_tstmp/;
    $$rp->{t0} = $1;

    #
    # From the bootseq end message, extract the finish wallclock timestamp
    # into $$rp->{t1}.
    #
    $::log[$idx1] =~ m/$::re_tstmp/;
    $$rp->{t1}  = $1;

    #
    # $$rp->{wcdur} is the wallclock-time difference between t0 and t1, i.e.
    # the difference between the wallclock timestamps of the messages we use
    # to identify the starting and ending log entries of this bootseq. This
    # interval should be roughly similar to the "user" boot time reported by
    # systemd, since our "start of bootseq" message is just the first message
    # issued by systemd, which occurs fairly close to the start of the usermode
    # portion of the boot.
    #
    $$rp->{wcdur} = delsec($$rp->{t0}, $$rp->{t1});

    #
    # Check bootseq number against various conditions for which we want to
    # ignore it for analysis. To start with, assume none of those reasons.
    #
    $$rp->{ign_cmd} = 0;
    $$rp->{ign_sht} = 0;
    $$rp->{ign_err} = 0;
    $$rp->{ign_pec} = 0;
    $$rp->{ign_nst} = 0;

    # First check this bootseq against the list of bootseqs to ignore, per
    # cmdline "-ib" option. If it's in that list, mark it as ignored for
    # this reason and go away.
    #
    if (inrange($bsidx+1, @::bsignlist)) # If bootseq in the "ignore" list...
    {
	$$rp->{ign_cmd} = 1;		 # Mark it ignored.
	++$::nbs_ign;			 # Bump the appropriate count
	if ($::RPT_PASSINF)
	{
	    rpt("  Ignoring bootseq #%03d (excluded by cmdline)\n", $bsidx+1);
	}
	return;				# Nothing else to do.
    }

    #
    # During Pass 1 if we detected that this bootseq was aborted due to error,
    # bump the associated count of errored bootseqs and return. 
    #
    if ($$rp->{termcode} eq 'bserr')
    {
	$$rp->{ign_err} = 1;		# Ignore it.
	++$::nbs_err;			# Bump the appropriate count
	if ($::RPT_PASSINF)
	{
	    rpt("  Ignoring bootseq #%03d (error during bootseq)\n", $bsidx+1);
	}
	return;
    }

    #
    # Like above, except if we had detected that this bootseq was terminated
    # due to nesting.
    #
    if ($$rp->{termcode} eq 'nested')
    {
	$$rp->{ign_nst} = 1;		# Ignore it.
	++$::nbs_nst;			# Bump the appropriate count
	if ($::RPT_PASSINF)
	{
	    rpt("  Ignoring bootseq #%03d (bootseq nested)\n", $bsidx+1);
	}
	return;
    }

    #
    # Like above, if this bootseq is suspiciously short.
    #
    if ($idx1 - $idx0 < $::TOO_SHORT)
    {
	$$rp->{ign_sht} = 1;		# Ignore it.
	++$::nbs_sht;			# Bump the appropriate count
	if ($::RPT_PASSINF)
	{
	    rpt("  Ignoring bootseq #%03d (bootseq too short)\n", $bsidx+1);
	}
	return;
    }

    #
    # Like above, if this bootseq is too peculiar.
    # TODO: Add peculiarity test
    #
    if (0)
    {
	$$rp->{ign_pec} = 1;		# Ignore it.
	++$::nbs_pec;			# Bump the appropriate count
	if ($::RPT_PASSINF)
	{
	    rpt("  Ignoring bootseq #%03d (bootseq too peculiar)\n", $bsidx+1);
	}
	return;
    }

    #
    # If we made it here, then this bootseq is accepted for analysis. Harvest
    # all its MOIs into %::moihash.
    #
    for ($ln = $idx0; $ln <= $idx1; ++$ln)
    {
	$_ = $::log[$ln];
	chomp;
	# rpt("line $ln: '$_'\n");

	#
	# Start out assuming this line doesn't contain a MOI.
	#
	$msg = '';

	#
	# Skip lines starting with '#', under the assumption that such lines
	# have been commented out by a human for de-corruption purposes.
	#
	if (/^#/) { next; }

	#
	# Skip lines specified in -il=lnignore commandline option.
	#
	if (inrange($ln, @::lnignlist)) { next; }

	#
	# "Starting..." messages issued by systemd. Set $msg to the msg text
	#
	if (/$::re_stng/)
	{
	    $msg = $1;
	}

	#
	# "Started..." messages issued by systemd. Set $msg to the msg text
	#
	elsif (/$::re_sted/)
	{
	    $msg = $1;
	}

	#
	# If this line contained a MOI, add it to %::moihash.  The hash value
	# is explicitly set to zero so we can depend on this assumption when
	# we later make per-bootseq copies of %::moihash in order to count
	# message occurrences within each bootseq.
	#
	if ($msg)
	{
	    $::moihash{$msg} = 0; 
	    # rpt("smsg: $msg\n");
	}
    }
}


#
# Analyze the bootseq passed by caller: Identify and count MOIs encountered
# within that bootseq and increment the counts in a per-bootseq copy of
# %::moihash. (That copy is maintained in our bsrec[] entry $::bsrec[$idx],
# where $idx passed in by the caller).  Thus, upon return, that hash copy is
# a fully populated record of all MOIs, both encountered and not-encountered
# in this bootseq: A zero count means that the associated MOI -- i.e. the hash
# key -- did not occur between T0 and T1 in this bootseq, and is therefore a
# candidate dropped message. 
#
sub analyze_bootseq
{
    use strict;

    my	($l0, $l1, $idx) = @_;	# Args: (startln, endln, idx into @bsrec)
    my	$t;			# Scratch
    my	$ln;			# Current line number, 0-origin
    my	$msg;			# Extracted message text
    my	$rp = \$::bsrec[$idx];	# Shorthand ptr to our bsrec[] entry
    my	%mh = %::moihash;	# Copy of %::moihash pertaining to this bootseq

    #
    # Assume no MOIs present. 
    #
    $$rp->{moi_present} = 0;

    #
    # Don't analyze bootseqs that were filtered out in Pass 2.
    #
    if ($$rp->{ign_cmd}) { return; }	# Ignored per cmdline "-ib" option
    if ($$rp->{ign_err}) { return; }	# Ignored due to boot-time error
    if ($$rp->{ign_sht}) { return; }	# Ignored because suspiciously short
    if ($$rp->{ign_pec}) { return; }	# Ignored because too peculiar
    if ($$rp->{ign_nst}) { return; }	# Ignored because nested


    #
    # Ok, get here if this bootseq is accepted for analysis.  Examine each
    # line therein, looking for MOIs.
    #
    for ($ln = $$rp->{l0}; $ln <= $$rp->{l1}; ++$ln)
    {
	$_ = $::log[$ln];
	chomp;
	# rpt("line $ln: '$_'\n");

	#
	# Start out assuming this line doesn't contain a MOI.
	#
	$msg = '';

	#
	# Skip lines starting with '#', under the assumption that such lines
	# have been commented out by a human for de-corruption purposes.
	#
	if (/^#/) { next; }

	#
	# Skip lines specified in -il=lnignore commandline option.
	#
	if (inrange($ln, @::lnignlist)) { next; }

	#
	# "Starting..." reports
	#
	if (/$::re_stng/)
	{
	    $msg = $1;
	}

	#
	# "Started..." reports
	#
	elsif (/$::re_sted/)
	{
	    $msg = $1;
	}

	#
	# If line contained a message of interest, verify it appears in
	# %mh, and increments its hash value (which is effectively a count
	# of the number of occurrences of this message in this bootseq).
	#
	if ($msg)
	{
	    if (!defined($mh{$msg}))
	    {
		die "? internal error: message '$msg' not in hash copy\n";
	    }
	    $mh{$msg}++;
	    # rpt("added $msg to bootseq %d\n", $idx+1);
	}
    }

    #
    # %mh now contains the counts of each MOI found in this bootseq, copy
    # it to our @bsrec entry.
    #
    $$rp->{mh} = \%mh;

    #
    # Count the total number of MOIs pressent in this bootseq. Note that
    # if a each present MOI is counted only once, even if it occurs
    # multiple times.
    # 
    foreach (values($$rp->{mh}))
    {
	if ($_ > 0) { $$rp->{moi_present}++; }
    }
}


#
# Barf some basic high-level info to the reportfile.
#
sub barf_info
{
    $t = versinfo();
    rpt("\n$t\n");
    rpt("parsing strictness level: $STRICT\n");

    #
    # Report header: Version info, some key parms.
    #
    printf($rptfh "#\n# Generated by $t at $rptdate\n#\n");
    printf($rptfh "# commandline args:\t\t$cmdargs\n");
    printf($rptfh "# bootseq ignore list:\t\t%s\n",
			$IBSPEC eq '' ? 'None' : $IBSPEC);
    printf($rptfh "# line number ignore list:\t%s\n",
			$ILSPEC eq '' ? 'None' : $ILSPEC);
    printf($rptfh "# logfile:\t\t\t$logfn\n");
    printf($rptfh "# report file:\t\t\t$rptfn\n");
    printf($rptfh "# parsing strictness level:\t$STRICT\n");
    if ($RPT_SYSINF)
    {
	printf($rptfh "# system datagram qlen:\t\t");
	if ($sys_dgram_qlen)
	{
	    printf($rptfh "$sys_dgram_qlen\n");
	}
	else
	{
	    printf($rptfh "(cannot read $QLEN_FILE)\n")
	}
    }
    printf($rptfh "#\n\n");		# Separator between header and body 
}


#
# Misc sanity checks and warning issuances on what we've found so far.
#
sub sanity_checks
{
    #
    # FIXME: Any reasonable tests here?
    #
}


#
# Per-bootseq reporting: Go thru the list of bootseqs, analyze each one and
# report the details of each to the reportfile.
#
sub bootseq_rpt
{
    my	$t;				# Scratch
    my	$rp;				# Ptr to per-bootseq @bsrec entry
    my	($l0, $l1);			# Shorthand for bsrec[]->{l0}, {l1}
    my	($l0msg, $l1msg);		# Extracted msg text from l0, l1
    my	$ver;				# systemd version extracted from $l0
    my	%mh;				# Convenience copy of $$rp->{mh}
    my	$bsnum;				# Convenience var: Bootseq #, 1-origin
    my	$droprate;			# Per-bootseq estimated msg drop rate
	

    for ($i = 0; $i < $nbs_tot; ++$i)	# Loop over bootseqs
    {
	$bsnum = $i + 1;		# Human style bootseq number
	$rp = \$bsrec[$i];
	$l0 = $$rp->{l0};		# Shorthand
	$l1 = $$rp->{l1};

	#
	# Extract message text and systemd version from bootseq start message
	#
	$::log[$l0] =~ m/$::re_bssta/;
	$l0msg = substr($1,0,50) . '...';
	$ver = $2;

	#
	# Extract message text from bootseq end message.
	#
	$::log[$l1] =~ m/$::re_bsend/;
	$l1msg = substr($1,0,50) . '...';

	%mh = %{$$rp->{mh}};		# Convenience copy 

	#
	# Estimated per-bootseq message drop rate is the "gap" between the
	# superset total MOI count and the number of MOIs in this bootseq,
	# normalized by the total count.
	# 
	$droprate = 100*($::moi_tot - $$rp->{moi_present}) / $::moi_tot;


	#
	# Report the basic info for this bootseq.
	#
	printf($rptfh "  Bootseq #%03d:\t\tLogfile lines %d - %d  (%d lines)\n",
			    $bsnum, $l0+1, $l1+1, $l1-$l0+1);
	printf($rptfh "    Systemd version:\t$ver\n");
	printf($rptfh "    T0:\t\t\t$$rp->{t0}\n");
	printf($rptfh "    \t\t\t\"$l0msg\"\n");
	printf($rptfh "    T1:\t\t\t$$rp->{t1}\n");
	printf($rptfh "    \t\t\t\"$l1msg\"\n");
	printf($rptfh "    T1-T0:\t\t%d s\n", $$rp->{wcdur});

	#
	# Header for the "report counts" line, which we fill in below
	# as we go.  It might say "none", or it might have the actual numbers.
	#
	printf($rptfh "    MOIs present:\t");

	#
	# If bootseq flagged for ignoration, report the reason and skip it.
	#
	if ($$rp->{ign_cmd})
	{
	    printf($rptfh "None (bootseq ignored per cmdline)\n\n\n");
	    next;
	}

	if ($$rp->{ign_sht})
	{
	    printf($rptfh "None (bootseq too short)\n\n\n");
	    next;
	}

	if ($$rp->{ign_err})
	{
	    printf($rptfh "None (error during bootseq)\n\n\n");
	    next;
	}

	if ($$rp->{ign_pec})
	{
	    printf($rptfh "None (bootseq too peculiar)\n\n\n");
	    next;
	}

	if ($$rp->{ign_nst})
	{
	    printf($rptfh "None (bootseq nested)\n\n\n");
	    next;
	}


	#
	# Report number of MOIs present in this bootseq/total number of MOIs,
	# and the drop rate.
	#
	printf($rptfh "%d/%d\n", $$rp->{moi_present}, $::moi_tot);
	printf($rptfh "    Est. drop rate:\t%.0f%%\n", $droprate);

	#
	# Report text of each absent MOI
	#
	printf($rptfh "    Missing MOIs:\n");

	#
	# Report the text of all MOIs which were not present in this bootseq.
	#
	foreach (sort(keys(%mh)))
	{
	    if ($mh{$_} == 0)
	    {
		printf($rptfh "\t\t\t%s\n", substr($_,0,45));
	    }
	}


	printf($rptfh "\n\n");
	printf($rptfh "\n");

	if ($::RPT_PASSINF)
	{
	    rpt("  Bootseq #%03d: %d/%d MOIs present, drop rate ~= %3.0f%%\n",
		   $bsnum, $$rp->{moi_present}, $::moi_tot, $droprate);
	}
    }
}


#
# Issue summary report.
#
sub session_rpt
{
    use strict;

    printf($::rptfh "Summary:\n");
    printf($::rptfh "  Total bootseq events:\t\t%4d\n", $::nbs_tot);
    printf($::rptfh "    Ignored (per cmdline): \t%4d\n", $::nbs_ign);
    printf($::rptfh "    Ignored (nested):      \t%4d\n", $::nbs_nst);
    printf($::rptfh "    Ignored (too peculiar):\t%4d\n", $::nbs_pec);
    printf($::rptfh "    Ignored (too short):   \t%4d\n", $::nbs_sht);
    printf($::rptfh "    Ignored (boot error):  \t%4d\n", $::nbs_err);
    printf($::rptfh "    Analyzed:              \t%4d\n", $::nbs_ana);
}


#
# Initialize main::* vars
#
sub initialize
{
    #
    # Cmdline parm defaults.
    #
    $STRICT = 1;		# Strictness level
    $RPT_NONMON = 0;		# Don't report non-mono timestamps. 
    $RPT_PASSINF = 0;		# Don't report detailed per-pass info

    # 
    # Misc constants
    #
    $TOO_SHORT = 100;		# Reject bootseqs with fewer than this # lines

    #
    # Pseudo-file containing this system's max datagraph queue length
    # parameter, as reported by -s option. 
    #
    $QLEN_FILE = '/proc/sys/net/unix/max_dgram_qlen';

    #
    # Misc inits
    #
    @bsignlist = ();		# List of bootseqs to ignore, from cmdline.
    @lnignlist = ();		# List of line #'s to ignore, from cmdline.
    $nbs_tot = 0;		# Total # of bootseq's found.
    $nbs_ana = 0;		# # bootseqs accepted for analysis
    $nbs_sht = 0;		# # bootseqs too short to analyze
    $nbs_pec = 0;		# # bootseqs too peculiar to analyze
    $nbs_err = 0;		# # bootseqs aborted due to boot error 
    $nbs_nst = 0;		# # bootseqs terminated due to nesting.
    $nbs_ign = 0;		# # bootseqs ignored per cmdline

    #
    # Pull/chomp current date before setting INPUT_RECORD_SEPARATOR for DOS.
    #
    $rptdate = `date`;
    chomp $rptdate;

    #
    # Assume unix line endings for journalctl output processing.
    #
    $INPUT_RECORD_SEPARATOR = "\n";	# Var name assumes "use English"

    #
    # RE for extracting wallclock timestamps. Just the first 15 chars.
    #
    $re_tstmp = '(^.{15})';

    #
    # RE used to identify message we interpret as the start of a bootseq.
    # $1 is the leading part of the message; $2 is the reported systemd version.
    #
    $re_bssta = '(systemd (\d+) running in.*)';	#

    #
    # RE used to idenfity the end of a boot sequence.
    #
    $re_bsend = '(Startup finished in.*)';

    #
    # RE for "Starting..." messages issued by systemd. $1 is the msg text
    #
    $re_stng = '^.*systemd\[\d+\].*(Starting\s*.*$)';

    #
    # RE for "Started..." messages issued by systemd. $1 is the msg text
    #
    $re_sted = '^.*systemd\[\d+\].*(Started\s*.*$)';
}

#
# Usage barf
#
sub usage
{
    my 	$t;

    $t =
	   "\n" .
	   "usage:\n" .
	   "  jcdiag [-h] [-v] [-m] [-ib=bsignore] [-il=lnignore] logfile\n" .
	   "    -h    This help text\n" .
	   "    -v    Verbosely report info from each pass to stderr\n" .
	   "    -m    Report non-monotonic timestamps.\n" .
	   "    -s    Report miscellaneous system info of interest\n" .
	   "    -ib   Ignore bootseq numbers specified in 'bsignore'.\n" .
	   "          Example: '-ib=3-7,10,19' [No spaces allowed]\n" .
	   "    -il   Ignore logfile line numbers specified in 'lnignore'.\n" .
	   "          Example: '-il=30-70,100-192' [No spaces allowed]\n";

    rpt($t);
    rpt("\n");
    exit(-1);
}


#
# Cmdline parse. 
#
sub parse_cmdline
{
    #
    # Assume -ib and -il options absent, hence associated range specs = null.
    #
    $IBSPEC = '';
    $ILSPEC = '';

    $cmdargs = join(' ', @ARGV);

    Getopt::Long::Configure('no_ignore_case'); # Option tags case sensitive
    Getopt::Long::Configure('require_order');  # Require opts before other args
    GetOptions (
		    "ib=s"       => \$opt_ib,
		    "il=s"       => \$opt_il,
		    "m"          => \$opt_m,
		    "v"          => \$opt_v,
		    "h"          => \$opt_h,
		    "s"		 => \$opt_s,
		) or exit(-1);

    if (@ARGV != 1)	{ usage();		}
    if ($opt_h)		{ usage();		}
    if ($opt_v)		{ $RPT_PASSINF = 1;	}
    if ($opt_m)		{ $RPT_NONMON  = 1;	}
    if ($opt_s)		{ $RPT_SYSINF  = 1;	}

    if ($opt_ib)
    {
	$IBSPEC = $opt_ib;
	my @ign = intspec_canon($IBSPEC, 10000);
	if ($ign[0] < 0) { die "? bsignore range spec error $ign[0]\n"; }
	@bsignlist = @ign;
    }

    if ($opt_il)
    {
	$ILSPEC = $opt_il;
	my @ign = intspec_canon($ILSPEC, 10000000);
	if ($ign[0] < 0) { die "? lnignore range spec error $ign[0]\n"; }
	@lnignlist = @ign;
    }

    # rpt("bsignlist: '@bsignlist'\n");
    # rpt("lnignlist: '@lnignlist'\n");

    #
    # Logfile name ($logfn) from cmdline.  Report and dump output filenames
    # are the basename of $logfn concatenated with appropriate extension.
    #
    $logfn = $ARGV[0];
    $rptfn = fileparse($logfn, qw{\.[^.]*}) . '.rpt';	# Report

    #
    # Open the report file so on return, mainline can start using it.
    #
    open($rptfh, ">$rptfn") or die "? unable to create $::rptfn\n";

    #
    # If -s option, collect some info about the execution environment.
    #
    if ($RPT_SYSINF)
    {
	# See if we can actually read $QLEN_FILE. If not, warn and set the
	# value to null string so we know later.
	if (-r $QLEN_FILE)
	{
	    $sys_dgram_qlen = `cat $QLEN_FILE`;
	    chomp $sys_dgram_qlen;
	}
	else
	{
	    $sys_dgram_qlen = '';
	    rpt("WARNING: Cannot read $QLEN_FILE for -s option\n");
	}
    }
}


#
# Format and return RCS version info string.
#
sub versinfo
{
    use strict;
    my	($progname, $version, $locker);
    my	$verstring;

    ($progname = '$RCSfile: jcdiag,v $') =~ s/.RCSfile: //;
    $progname =~ s/,v .$//;

    ($version = '$Revision: 1.39 $') =~ s/.Revision: //;
    $version =~ s/ .$//;

    ($locker = '$Locker:  $') =~ s/.Locker: //;
    $locker =~ s/ .$//;

    $verstring = sprintf("$progname v$version");
    if ($locker !~ /^\s*$/)
    {
	$verstring .= sprintf(" (locked by $locker)");
    }

    return $verstring;
}

#
# Slurp logfile into @::log
#
sub slurp_logfile
{
    my	$logfh;

    if (!open($logfh, $logfn))
    {
	die "? unable to open $logfn\n";
    }
    rpt("reading $logfn... ");
    @log = <$logfh> or die;
    rpt("%d lines\n", $#log+1);
    rpt("\n");
}

#
# Clean up after a run. 
#
sub cleanup
{
    #
    # Close report and data dump files.
    #
    close($::rptfh);
}


#
# Validate a timestamp of the form Mmm dd hh:mm:ss. Zero return = OK, else
# error.
#
sub validate_ts
{
    use strict;

    my	($ts) = @_;
    my	($tslen);
    my	($tim);
    my	($mo, $da);
    my	($hr, $mi, $se);
    my  %molist = ('Jan',  1, 'Feb',  2, 'Mar',  3, 'Apr',  4,
		   'May',  5, 'Jun',  6, 'Jul',  7, 'Aug',  8,
		   'Sep',  9, 'Oct', 10, 'Nov', 11, 'Dec', 12);

    #
    # Basic length check: Mmm dd hh:mm:ss should be length 15 always.
    #
    $tslen = length($ts);
    if ($tslen != 15) { return -1; }

    #
    # Split into month, day, and time. Verify ' ' delimiters present first,
    # so don't attempt split() on malformed string.
    #
    if (substr($ts,3,1) ne ' ' or substr($ts,6,1) ne ' ') { return -2; }
    ($mo, $da, $tim) = split(' ', $ts);

    #
    # Validate month abbreciation.
    #
    if (!defined($molist{$mo})) { return -20;}

    #
    # Day of month range check
    #
    if ($da < 1 or $da > 31)      { return -23; }

    #
    # Time: Basic syntax check
    #
    if (!($tim =~ /([0-2][0-9]):([0-5][0-9]):([0-5][0-9])/))
    {
	return -30;
    }

    #
    # Time: Subfield range checks
    #
    ($hr, $mi, $se) = ($1, $2, $3);
    if ($hr < 0 or $hr > 23)  { return -31; }
    if ($mi < 0 or $mi > 59)  { return -32; }
    if ($se < 0 or $se > 59)  { return -33; }

    return 0;
}

#
# Shorthand for printf(STDERR ...);
#
sub rpt
{
    printf(STDERR @_);
}

#
# Return canonicalized rangelist array based on supplied intspec, or negative
# scalar if intspec is invalid.
#
sub intspec_canon($$)
{
    my	($spec, $hlim) = @_;	# Parms: Spec and hi limit
    my	($r0, $r1);		# First, last elements in range
    my	@ret = ();		# returned array

    if ($hlim < 0) { return -4; }

    #
    # Split the specification into fields, classify each, set $r0 and $r1
    # accordingly, and then append the ($r0, $r1) pair to @ret. Note that
    # $r0 and $r1 will be the same value for single-number fields.
    #
    foreach (split(/,/, $spec))		# Loop over each field in the spec
    {
	if (/^[0-9]+$/)			# Simple number
	{
	    $r0 = $r1 = $_;
	}

	elsif (/^[0-9]+\-[0-9]+/)	# Explicitly limited range
	{
	    ($r0, $r1) = split(/\-/);
	}

	elsif (/^[0-9]+\-$/)		# Implicitly upper-limited range
	{
	    ($r0) = split(/\-/);	# $r0 is the supplied left element 
	    $r1 = $hlim;		# $r1 is $hlim
	}

	elsif (/^$/)			# Null field. Just ignore it.
	{
	    next;
	}

	else				# Bzzzzt. 
	{
	    return (-1);		# No need to continue, it's hosed
	}

	if ($r1 < $r0)			# Ordering error
	{
	    return (-2);
	}

	if ($r1 > $hlim)		# Hi limit past the end, bag it
	{
	    return (-3);
	}

	@ret = (@ret, $r0, $r1);	# Append selection to @ret
    }

    return @ret;
}

#
# NAME
#   delsec - difference in seconds (incl. fractional seconds) between two dates
#
# SYNOPSIS
#   $seconds = delsec($datestring0, $datestring1);
#
# DESCRIPTION
#   Return the difference in seconds, including fractional seconds, between
#   the supplied dates, with $datestring0 assumed to be the earlier.
#
#   Tested with input dates in format of $ds0 and $ds1 above, and a few
#   variations thereof.  Assumes that fractional seconds, if supplied in 
#   either date, are right-anchored in the date string, without trailing 
#   whitespace.
#
sub delsec
{
    use Date::Manip;

    my	($ds0, $ds1) = @_;	# Args: Caller-supplied free-form dates
    my	($fs0, $fs1) = ('0.0', '0.0');	# Fractional seconds: Default to "0.0"
    my	($date0, $date1);
    my	($udate0, $udate1);
    my	$nargs = 1 + $#_;

    if ($nargs != 2)
    {  
	die "? Internal error: delsec() called with $nargs args\n";
    }


    #
    # Parse off the fractional seconds from each date, if there are any.
    # These are used later as a "correction" to the delta-date computed by
    # the difference between the Unix-epoch dates, since that delta doesn't
    # include fractional seconds.
    #
    $fs0 = ($ds0 =~ m/(\.[0-9]*$)/)	? $1 : '0.0';
    $fs1 = ($ds1 =~ m/(\.[0-9]*$)/)	? $1 : '0.0';
    #print "ds0 = $ds0,   ds1 = $ds1\n";
    #print "fs0 = $fs0, fs1 = $fs1\n";

    #
    # Convert each date to Date::Manip internal format.
    #
    $date0 = ParseDate($ds0);
    $date1 = ParseDate($ds1);
    #print "'$date0'\n";
    #print "'$date1'\n";

    #
    # Convert internal format dates to epoch seconds.
    #
    $udate0 = UnixDate($date0, "%s");
    $udate1 = UnixDate($date1, "%s");
    #print "'$udate0'\n";
    #print "'$udate1'\n";

    #
    # Compute delta-epoch-seconds, including the fractional part. Note the
    # implicit string-numeric -> numeric conversion of the fractional part.
    # I love perl.
    # 
   
    return ($udate1 + $fs1) - ($udate0 + $fs0);
}


#
# Return nonzero if $n is within [assumedly canonicalized] @rangelist,
# zero otherwise. If @rangelist is empty, return zero.
#
sub inrange
{
    my	($n, @rangelist) = @_;

    #
    # For each pair of values in @rangelist, check if $n is in the interval.
    # This also correctly returns zero if @rangelist is empty.
    # 
    for (my $i = 0; $i < @rangelist; $i += 2)
    {
	if ($n >= $rangelist[$i] and $n <= $rangelist[$i+1]) { return 1; }
    }
    return 0;
}


#
# vim:filetype=perl 
#