#!/usr/bin/perl # # $Id: jcdiag,v 1.39 2013/07/20 15:04:45 gdg Exp $ # # # Copyright (c) 2013, Glenn D. Golden # # # NAME # jcdiag - journalctl diagnostic tool # # SYNOPSIS # jcdiag [-h] [-v] [-m] [-s] [-ib=bsignore] [-il=lnignore] logfile # # DESCRIPTION # jcdiag is a utility for analyzing logfiles produced by journalctl, with # the specific aim of identifying boot-time messages that may have been # dropped by the journaling daemon, journald. This is presently the topic # of a bug report filed by the author: # # https://bugs.freedesktop.org/show_bug.cgi?id=66664 # # The underlying issue is discussed in more detail there. # # jcdiag expects as input a logfile produced by journalctl using the default # "short" style output mode (or explicitly via "--output=short"). For the # results to be statistically useful, the logfile should preferably contain # multiple boot sequences, all of which were obtained under system conditions # which are as similar as feasible. (Ideally: Boot, log in as root and # immediately shut the system down for reboot.) # # Given such input, jcdiag then makes three passes thru the logfile: # # Pass 1: # Determine the approximate boundaries of each boot sequence. For # jcdiag's purposes, the bootseq boundaries are defined by the messages # # "systemd XXX running in..." # Message T0 (XXX = systemd version) # "Startup finished in..." # Message T1 # # These definitions are intended to roughly cover the period of time # when systemd PID1 itself is issuing boot-related messages. # # Pass 2: # Examine each bootseq identified in Pass 1 and filter out rogues of # various sorts, including bootseqs that seem too short, too peculiar, # or were nested or otherwise improperly terminated. Also ignored are # any bootseqs specified in 'bsignore' from the commandline; see below. # # Then, among the remaining bootseqs, identify all messages of interest # ("MOIs"). Presently, MOIs are defined to comprise all "Starting... " # and "Started..." messages issued by systemd itself, e.g. # # Starting Virtual console mouse server # Started Network Time Service # # The purpose of this pass is to form a superset list of every MOI # encountered during any of the analyzed boot sequences. # # Pass 3: # Examine each bootseq accepted by Pass 2, and for each one, note which # MOIs from the superset list did and did not occur within the bootseq. # # Finally, using the information from Passes 2 and 3, a detailed report is # generated, listing suspect dropped message and an estimate of the drop # rate for each boot sequence. With the above explanation in hand, the # contents of the report should be reasonably self-explanatory. See also # the DISCUSSION section below. # # The report filename is the orignal logfile name suffixed with ".rpt", # unless the logfile already had a suffix, in which case it that suffix is # replaced with "rpt". # # The -v option barfs detailed information to stderr about the progress # through each pass. # # The -m option reports non-monotonic timestamps. This is really of little # value, since timestamps in the "short" output format are expected to # occasionally be out of order anyway. # # The -il and -ib options allow ranges of lines and bootseqs to be ignored # for the purpose of the analysis. For example "-il=25-90,115-400" ignores # lines 25-90, and 115-400 (inclusive). "-ib="1,2,9-15" ignores bootseqs # 1, 2, and 9-15 inclusive. Line numbers and bootseq numbers are 1-origin. # # The -s option reads and reports values of various system parameters # which may affect the probability of journald dropping messags. Presently, # only the value of /proc/sys/net/unix/max_dgram_qlen is reported. NOTE: # These parameters are read from /proc/sys of the machine on which jcdiag # is executing, so obviously the information is meaningful only if that # execution environment is identical to that seen by the journald instance # that collected the data in the logfile. # # DISCUSSION # Under the assumptions mentioned above for the logfile content, any MOI in # the superset list which does not also appear in the set of MOIs extracted # from a particular bootseq implies that that message was probably dropped # by journald during that bootseq. This is not an airtight implication. # Nevertheless, under the assumptions, it is a reasonable smoking gun, and # the drop rates estimated based on it should not be too grossly in error. # # The rate estimates will be wrong in at least the following situations: # # * If the system state is not identical across boot cycles, there may # be "outlier" boot sequences which generate many more MOIs than the # average. This will result in a higher total superset MOI count, thus # biasing the dropped message rate estimate upward. # # * Due to realtime loading and buffering issues, it is possible that MOIs # which were actually generated between T0 and T1 do not appear in the # logfile between the T0 and T1 messages, and therefore are not seen at # all by jcdiag. This will tend to bias the dropped message rate # estimate downward. # # * Opposite of above: It is also possible that MOIs which were actually # generated prior to T0 appear in the logfile after the T0 message, # and thus will be processed by jcdiag. For example, some systems invoke # systemd PID1 both during the initrd phase, and then again after the # real root filesystem is mounted. This causes two T0 messages to be # generated. Presently, jcdiag pays attention only to the second one, # thus considering as MOIs only those messages which appear in the # logfile after the second systemd announcement. But if any MOIs # generated during the initrd invocation wind up in the logfile after # the second T0 message, those will count toward the superset MOI total, # and thus bias the dropped message rate estimate upward. # # Detailed knowledge of the specific boot procedures in combination with the # the jcdiag report (which lists the suspected dropped messages individually) # should provide enough information to understand whether a particular listed # message was actually dropped or simply mis-counted by jcdiag for one of the # above reasons. # # AUTHOR # G. D. Golden, 2013.07 # use Getopt::Long; use English; use File::Basename; use FileHandle; use warnings; initialize(); # Initialize parse_cmdline(); # Parse commandline barf_info(); # Report some high-level info slurp_logfile(); # Suck in the logfile top_level_parse(); # Locate bootseq boundaries. mid_level_parse(); # Mid-level parse: More details on each bootseq. sanity_checks(); # Sanity checks/warnings on current state of affairs bootseq_rpt(); # Detailed report on each bootseq. session_rpt(); # Session summary report. cleanup(); # Clean up and go away. ############################################################################# # # Top-level parse (Pass 1): Identify all bootseq events in the logfile # (even those which may later be ignored for analysis). Fill in @bs with # 0-origin line index at which each bootseq begins ($re_bssta message) and # ends ($re_bsend message or EOF). # sub top_level_parse { my $lidx = 0; # Line index, 0-origin my $lnum = 1; # Line number, for humans, 1-origin. my $ts; # Timestamp from current line my $prevts = ''; # Most recent timestamp prior to current line my $t; # Scratch. my $in_bs = 0; # State var for this loop: # 0: We're not inside a bootseq event yet. # 1: Inside a real bootseq event (i.e., as # initiated by a "bootseq start" message) # 2: Not used [yet] in jcdiag @bs = (); # List of "bootseq start" lines (0-origin) # (Note global var.) rpt("Pass 1: Locate bootseq boundaries\n"); if ($::ILSPEC ne '') { rpt(" lines $::ILSPEC ignored per cmdline\n"); rpt("\n"); } # # Loop over each line in the logfile. # foreach (@log) { # # Skip lines mentioned in the cmdline "line ignore" list. # if (inrange($lnum, @::lnignlist)) { goto skipline; } # # Skip lines starting with '#', under the assumption that such # lines have been manually commented out. # if (/^#/) { goto skipline; } # # Look for what appears to be a timestamp, without being too picky. # Then verify its legality and monotonicity in detail. In strict mode, # die on any timestamp corruption. In non-strict mode, just whine # about the corruption, and skip this logfile line. Doesn't make sense # to try to continue process the line if the timestamp is fucqued. # $ts = ''; # Assume we won't find a valid timestamp. if (/(^[A-Z][a-z][a-z] \d\d \d\d:\d\d:\d\d)/) { $ts = $1; # rpt("line $lnum: ts = '$ts'\n"); if (validate_ts($ts) != 0) # If timestamp corrupted... { if ($STRICT > 0) { die "? Corrupted timestamp, line $lnum\n"; } rpt("* Corrupted timestamp, line $lnum; line ignored.\n"); goto skipline; } # # If we have a previous time stamp, verify present one is later. # NOTE: Perl string comparison is used for the date comparison, # because delsec (which uses Date::Manip) is UFS. Seems like this # should be safe, since the timestamp subfields are ordered # appropriately for string comparison, but no guarantees. # if ($prevts and ($ts lt $prevts)) { $t = sprintf("Timestamp non-monotonic, line %5d: [%s, %s]\n", $lnum, $prevts, $ts); if ($STRICT > 1) { die "? $t"; } if ($RPT_NONMON) { rpt("* $t"); } } $prevts = $ts; } # # Also skip the line if we couldn't even find a timestamp. Every type # of line we're interested in oughta have one. If we can't find it # (or if it's so corrupted we can't recognize it) flux it. # if ($ts eq '') { goto skipline; } # # Boot sequence start message (identified by RE $::re_bssta). # if (m/$::re_bssta/) { # rpt("%6d: $_",$lnum); # # If we were already processing a bootseq event when we saw this # "bootseq start" message, set the present bootseq (the one we're # already processing) termination code to 'nested', for later # handling and ignoration. This situation is indicative of a # malformed logfile in which the present bootseq was either not # terminated as expected or the termination message was lost. # if ($in_bs == 1) { if ($::RPT_PASSINF) { rpt(" line %6d: bssta nested, terminating this " . "bootseq\n", $lnum); rpt("\n"); } $t = $lidx-1; push(@bs, "$t nested"); # Termcode for outer bootseq $in_bs = 0; # Not in a bootseq anymore redo; # Let's reprocess it } # # Some systems exec systemd both during the initrd phase of bootup # and then again later after real / is mounted. For our purposes, # we wish to ignore the initrd instances of systemd and wait for # the "real" one after / mount. Fortunately, it seems that at least # some versions of systemd (205+?) announce when they are running # in initrd mode by issuing a message "Running in initial RAM disk" # just after starting. So look for this message over the next 10 # lines (10 = arbitrary value) and if we see it, ignore this bssta # message. # if (grep(/Running in initial RAM disk/, @log[$lidx+1..$lidx+10])) { if ($::RPT_PASSINF) { rpt(" line %6d: bssta during initrd phase, ignored\n", $lnum); } goto skipline; } if ($::RPT_PASSINF) { rpt(" line %6d: bssta after real root mount, " . "accepted as T0\n", $lnum); } # # In either case (already inside a bootseq or not) push the # line number and "bssta" initcode onto @bs, and indicate that # we are now inside a bootseq. # push(@bs, "$lidx bssta"); $in_bs = 1; # "1" means we're inside a real bootseq now. } # # Boot sequence end message. # elsif (m/$::re_bsend/) { # rpt("%6d: $_",$lnum); $::RPT_PASSINF && rpt(" line %6d: ", $lnum); if ($in_bs == 0) # Not in a bootseq, consider it spurious { if ($::RPT_PASSINF) { rpt("spurious bsend, ignored\n\n"); } } elsif ($in_bs == 1) # If we're inside a bootseq already... { if ($::RPT_PASSINF) { rpt("bsend in bootseq as anticipated, accepted as T1\n\n"); } push(@bs, "$lidx bsend");# ...show ending line # and norm exit } elsif ($#bs == -1) # If no start message seen yet, ok... { if ($::RPT_PASSINF) { rpt("* Ignoring early bsend at line $lnum\n"); } } $in_bs = 0; # We're not inside a bootseq anymore. } # # Bottom of loop processing: Flow from above or skip here for logfile # lines we can't (or choose not to) process. # skipline: { ++$lidx; ++$lnum; } } # # If $in_bs is still set, then the last bootseq event in the logfile never # terminated before the log was closed. This bootseq event is ignored: # Report it, then pop off the last 'bootseq start' message, so we don't get # confused by it when doing the analysis later. # if ($in_bs) { rpt("* Last bootseq continues past end of logfile, " . "bootseq ignored.\n"); pop(@bs); # Pop it off, forget about it. } # # Total number of bootseqs, $nbs_tot, is half the number of entries in @bs, # since @bs (should) contain one starting message and one terminating # message per bootseq. So verify $nbs_tot is even, die if not. # $nbs_tot = ($#bs+1)/2; if (int($nbs_tot) != $nbs_tot) # Better be an even # of start+end recs { die "? AUUUGH! Hi-level parse inconsistency, aborting\n"; } if ($nbs_tot == 0) { die("? No bootseq events found\n"); } rpt("Pass 1: Complete. $nbs_tot total boot sequences identified\n"); rpt("\n"); } # # Mid-level parse (Passes 2 and 3): # # Pass 2: # Loop over all bootseqs identified in Pass 1, and filter out only those # which should participate in the subsequent analysis. Bootseqs eliminated # in this filtering process are those intentionally ignored per the "-ib" # cmdline option, and those for which some boot-time error or other # peculiarity was noticed during parsing. # # Then, for each of the filtered bootseqs, extract all encountered MOIs # into a global hash %::moihash. Thus, when Pass 2 is complete, %::moihash # comprises the superset of all MOIs over all analyzed bootseqs: The keys # are the MOI message texts and the values are the counts, which are all # set to 0. (The counts are used in Pass 3, see below.) # # Pass 3: # Loop over all the bootseqs selected in Pass 2, and perform a diagnostic # analysis of each one. Presently that analysis comprises making a per- # bootseq copy of %::moihash and then incrementing the count of every MOI # encountered within the bootseq. Thus, when complete, any MOI which has # a zero count in its private %::moihash copy is a candidate for having # been dropped, since a zero count implies that the message had been # encountered in some other bootseq, but not in this one. # sub mid_level_parse { use strict; my $debug = 0; # 1=dump %::moihash my ($i, $firstline, $lastline); # # Pass 2: Filter bootseqs and create %::moihash. # rpt("Pass 2: Filter bootseqs and form superset list of MOIs from those.\n"); for ($i = 0; $i < $::nbs_tot; ++$i) # Loop over bootseqs { $firstline = $::bs[2*$i]; # First line of $i-th bootseq $lastline = $::bs[2*$i + 1]; # Last line of $i-th bootseq $::bsrec[$i] = {}; # Tabula rasa extract_msgs($firstline, $lastline, $i); } # # Number of analyzed (i.e. filtered) bootseqs. # $::nbs_ana = $::nbs_tot - ($::nbs_ign+$::nbs_pec+$::nbs_sht+$::nbs_err+$::nbs_nst); # # Compute/report total number of messages of interest in the superset # message hash. # $::moi_tot = keys(%::moihash); rpt("Pass 2: Complete. %d/%d bootseqs accepted for analysis, " . "%d MOIs found\n", $::nbs_ana, $::nbs_tot, $::moi_tot); rpt("\n"); # # Debug: Dump some info about %::moihash # if ($debug) { my $t = %::moihash; # Used/allocated bucket info print "moihash: $t entries\n"; foreach (sort keys(%::moihash)) { print "$_\n"; } } # # Pass 3: Analyze each bootseq selected in Pass 2, looking for candidate # absent MOIs # rpt("Pass 3: Identify MOIs not present in each analyzed bootseq\n"); for ($i = 0; $i < $::nbs_tot; ++$i) # Loop over bootseqs { $firstline = $::bs[2*$i]; # First line of $i-th bootseq $lastline = $::bs[2*$i + 1]; # Last line of $i-th bootseq analyze_bootseq($firstline, $lastline, $i); } rpt("Pass 3: Complete. "); rpt("%d bootseqs analyzed\n", $::nbs_ana); rpt("\n"); } # # Examine bootseq passed by caller, and determine whether it should be selected # for analysis or not. If selected, extract all its MOIs into %::moihash. # sub extract_msgs { use strict; my ($l0, $l1, $bsidx) = @_;# Args: (startln, endln, idx into @bsrec) my ($idx0, $idx1); # Beginning/ending line indices for this bootseq my $t; # Scratch my ($ln); # Current line number, 0-origin my ($msg); # Extracted msg text my $rp = \$::bsrec[$bsidx];# Shorthand ptr to our bsrec[] entry # # Copy %::moihash to our bsrec, so it's guaranteed to be present even if # this bootseq winds up being ignored (below). # $$rp->{mh} = %::moihash; # # Logfile line numbers and init/exit codes for this bootseq: $l0 contains # the starting line number and "init code", which tells how the bootseq # event was initiated. $l1 contains the ending line number followed by # an "exit code", which tells how the bootseq event was terminated. # ($$rp->{l0}, $$rp->{initcode}) = split(' ', $l0); ($$rp->{l1}, $$rp->{termcode}) = split(' ', $l1); # # Convenience vars for the logfile line indices delimiting this bootseq. # $idx0 = $$rp->{l0}; $idx1 = $$rp->{l1}; # # From the bootseq start message, extract the start wallclock timestamp # into $$rp->{t0} # $::log[$idx0] =~ m/$::re_tstmp/; $$rp->{t0} = $1; # # From the bootseq end message, extract the finish wallclock timestamp # into $$rp->{t1}. # $::log[$idx1] =~ m/$::re_tstmp/; $$rp->{t1} = $1; # # $$rp->{wcdur} is the wallclock-time difference between t0 and t1, i.e. # the difference between the wallclock timestamps of the messages we use # to identify the starting and ending log entries of this bootseq. This # interval should be roughly similar to the "user" boot time reported by # systemd, since our "start of bootseq" message is just the first message # issued by systemd, which occurs fairly close to the start of the usermode # portion of the boot. # $$rp->{wcdur} = delsec($$rp->{t0}, $$rp->{t1}); # # Check bootseq number against various conditions for which we want to # ignore it for analysis. To start with, assume none of those reasons. # $$rp->{ign_cmd} = 0; $$rp->{ign_sht} = 0; $$rp->{ign_err} = 0; $$rp->{ign_pec} = 0; $$rp->{ign_nst} = 0; # First check this bootseq against the list of bootseqs to ignore, per # cmdline "-ib" option. If it's in that list, mark it as ignored for # this reason and go away. # if (inrange($bsidx+1, @::bsignlist)) # If bootseq in the "ignore" list... { $$rp->{ign_cmd} = 1; # Mark it ignored. ++$::nbs_ign; # Bump the appropriate count if ($::RPT_PASSINF) { rpt(" Ignoring bootseq #%03d (excluded by cmdline)\n", $bsidx+1); } return; # Nothing else to do. } # # During Pass 1 if we detected that this bootseq was aborted due to error, # bump the associated count of errored bootseqs and return. # if ($$rp->{termcode} eq 'bserr') { $$rp->{ign_err} = 1; # Ignore it. ++$::nbs_err; # Bump the appropriate count if ($::RPT_PASSINF) { rpt(" Ignoring bootseq #%03d (error during bootseq)\n", $bsidx+1); } return; } # # Like above, except if we had detected that this bootseq was terminated # due to nesting. # if ($$rp->{termcode} eq 'nested') { $$rp->{ign_nst} = 1; # Ignore it. ++$::nbs_nst; # Bump the appropriate count if ($::RPT_PASSINF) { rpt(" Ignoring bootseq #%03d (bootseq nested)\n", $bsidx+1); } return; } # # Like above, if this bootseq is suspiciously short. # if ($idx1 - $idx0 < $::TOO_SHORT) { $$rp->{ign_sht} = 1; # Ignore it. ++$::nbs_sht; # Bump the appropriate count if ($::RPT_PASSINF) { rpt(" Ignoring bootseq #%03d (bootseq too short)\n", $bsidx+1); } return; } # # Like above, if this bootseq is too peculiar. # TODO: Add peculiarity test # if (0) { $$rp->{ign_pec} = 1; # Ignore it. ++$::nbs_pec; # Bump the appropriate count if ($::RPT_PASSINF) { rpt(" Ignoring bootseq #%03d (bootseq too peculiar)\n", $bsidx+1); } return; } # # If we made it here, then this bootseq is accepted for analysis. Harvest # all its MOIs into %::moihash. # for ($ln = $idx0; $ln <= $idx1; ++$ln) { $_ = $::log[$ln]; chomp; # rpt("line $ln: '$_'\n"); # # Start out assuming this line doesn't contain a MOI. # $msg = ''; # # Skip lines starting with '#', under the assumption that such lines # have been commented out by a human for de-corruption purposes. # if (/^#/) { next; } # # Skip lines specified in -il=lnignore commandline option. # if (inrange($ln, @::lnignlist)) { next; } # # "Starting..." messages issued by systemd. Set $msg to the msg text # if (/$::re_stng/) { $msg = $1; } # # "Started..." messages issued by systemd. Set $msg to the msg text # elsif (/$::re_sted/) { $msg = $1; } # # If this line contained a MOI, add it to %::moihash. The hash value # is explicitly set to zero so we can depend on this assumption when # we later make per-bootseq copies of %::moihash in order to count # message occurrences within each bootseq. # if ($msg) { $::moihash{$msg} = 0; # rpt("smsg: $msg\n"); } } } # # Analyze the bootseq passed by caller: Identify and count MOIs encountered # within that bootseq and increment the counts in a per-bootseq copy of # %::moihash. (That copy is maintained in our bsrec[] entry $::bsrec[$idx], # where $idx passed in by the caller). Thus, upon return, that hash copy is # a fully populated record of all MOIs, both encountered and not-encountered # in this bootseq: A zero count means that the associated MOI -- i.e. the hash # key -- did not occur between T0 and T1 in this bootseq, and is therefore a # candidate dropped message. # sub analyze_bootseq { use strict; my ($l0, $l1, $idx) = @_; # Args: (startln, endln, idx into @bsrec) my $t; # Scratch my $ln; # Current line number, 0-origin my $msg; # Extracted message text my $rp = \$::bsrec[$idx]; # Shorthand ptr to our bsrec[] entry my %mh = %::moihash; # Copy of %::moihash pertaining to this bootseq # # Assume no MOIs present. # $$rp->{moi_present} = 0; # # Don't analyze bootseqs that were filtered out in Pass 2. # if ($$rp->{ign_cmd}) { return; } # Ignored per cmdline "-ib" option if ($$rp->{ign_err}) { return; } # Ignored due to boot-time error if ($$rp->{ign_sht}) { return; } # Ignored because suspiciously short if ($$rp->{ign_pec}) { return; } # Ignored because too peculiar if ($$rp->{ign_nst}) { return; } # Ignored because nested # # Ok, get here if this bootseq is accepted for analysis. Examine each # line therein, looking for MOIs. # for ($ln = $$rp->{l0}; $ln <= $$rp->{l1}; ++$ln) { $_ = $::log[$ln]; chomp; # rpt("line $ln: '$_'\n"); # # Start out assuming this line doesn't contain a MOI. # $msg = ''; # # Skip lines starting with '#', under the assumption that such lines # have been commented out by a human for de-corruption purposes. # if (/^#/) { next; } # # Skip lines specified in -il=lnignore commandline option. # if (inrange($ln, @::lnignlist)) { next; } # # "Starting..." reports # if (/$::re_stng/) { $msg = $1; } # # "Started..." reports # elsif (/$::re_sted/) { $msg = $1; } # # If line contained a message of interest, verify it appears in # %mh, and increments its hash value (which is effectively a count # of the number of occurrences of this message in this bootseq). # if ($msg) { if (!defined($mh{$msg})) { die "? internal error: message '$msg' not in hash copy\n"; } $mh{$msg}++; # rpt("added $msg to bootseq %d\n", $idx+1); } } # # %mh now contains the counts of each MOI found in this bootseq, copy # it to our @bsrec entry. # $$rp->{mh} = \%mh; # # Count the total number of MOIs pressent in this bootseq. Note that # if a each present MOI is counted only once, even if it occurs # multiple times. # foreach (values($$rp->{mh})) { if ($_ > 0) { $$rp->{moi_present}++; } } } # # Barf some basic high-level info to the reportfile. # sub barf_info { $t = versinfo(); rpt("\n$t\n"); rpt("parsing strictness level: $STRICT\n"); # # Report header: Version info, some key parms. # printf($rptfh "#\n# Generated by $t at $rptdate\n#\n"); printf($rptfh "# commandline args:\t\t$cmdargs\n"); printf($rptfh "# bootseq ignore list:\t\t%s\n", $IBSPEC eq '' ? 'None' : $IBSPEC); printf($rptfh "# line number ignore list:\t%s\n", $ILSPEC eq '' ? 'None' : $ILSPEC); printf($rptfh "# logfile:\t\t\t$logfn\n"); printf($rptfh "# report file:\t\t\t$rptfn\n"); printf($rptfh "# parsing strictness level:\t$STRICT\n"); if ($RPT_SYSINF) { printf($rptfh "# system datagram qlen:\t\t"); if ($sys_dgram_qlen) { printf($rptfh "$sys_dgram_qlen\n"); } else { printf($rptfh "(cannot read $QLEN_FILE)\n") } } printf($rptfh "#\n\n"); # Separator between header and body } # # Misc sanity checks and warning issuances on what we've found so far. # sub sanity_checks { # # FIXME: Any reasonable tests here? # } # # Per-bootseq reporting: Go thru the list of bootseqs, analyze each one and # report the details of each to the reportfile. # sub bootseq_rpt { my $t; # Scratch my $rp; # Ptr to per-bootseq @bsrec entry my ($l0, $l1); # Shorthand for bsrec[]->{l0}, {l1} my ($l0msg, $l1msg); # Extracted msg text from l0, l1 my $ver; # systemd version extracted from $l0 my %mh; # Convenience copy of $$rp->{mh} my $bsnum; # Convenience var: Bootseq #, 1-origin my $droprate; # Per-bootseq estimated msg drop rate for ($i = 0; $i < $nbs_tot; ++$i) # Loop over bootseqs { $bsnum = $i + 1; # Human style bootseq number $rp = \$bsrec[$i]; $l0 = $$rp->{l0}; # Shorthand $l1 = $$rp->{l1}; # # Extract message text and systemd version from bootseq start message # $::log[$l0] =~ m/$::re_bssta/; $l0msg = substr($1,0,50) . '...'; $ver = $2; # # Extract message text from bootseq end message. # $::log[$l1] =~ m/$::re_bsend/; $l1msg = substr($1,0,50) . '...'; %mh = %{$$rp->{mh}}; # Convenience copy # # Estimated per-bootseq message drop rate is the "gap" between the # superset total MOI count and the number of MOIs in this bootseq, # normalized by the total count. # $droprate = 100*($::moi_tot - $$rp->{moi_present}) / $::moi_tot; # # Report the basic info for this bootseq. # printf($rptfh " Bootseq #%03d:\t\tLogfile lines %d - %d (%d lines)\n", $bsnum, $l0+1, $l1+1, $l1-$l0+1); printf($rptfh " Systemd version:\t$ver\n"); printf($rptfh " T0:\t\t\t$$rp->{t0}\n"); printf($rptfh " \t\t\t\"$l0msg\"\n"); printf($rptfh " T1:\t\t\t$$rp->{t1}\n"); printf($rptfh " \t\t\t\"$l1msg\"\n"); printf($rptfh " T1-T0:\t\t%d s\n", $$rp->{wcdur}); # # Header for the "report counts" line, which we fill in below # as we go. It might say "none", or it might have the actual numbers. # printf($rptfh " MOIs present:\t"); # # If bootseq flagged for ignoration, report the reason and skip it. # if ($$rp->{ign_cmd}) { printf($rptfh "None (bootseq ignored per cmdline)\n\n\n"); next; } if ($$rp->{ign_sht}) { printf($rptfh "None (bootseq too short)\n\n\n"); next; } if ($$rp->{ign_err}) { printf($rptfh "None (error during bootseq)\n\n\n"); next; } if ($$rp->{ign_pec}) { printf($rptfh "None (bootseq too peculiar)\n\n\n"); next; } if ($$rp->{ign_nst}) { printf($rptfh "None (bootseq nested)\n\n\n"); next; } # # Report number of MOIs present in this bootseq/total number of MOIs, # and the drop rate. # printf($rptfh "%d/%d\n", $$rp->{moi_present}, $::moi_tot); printf($rptfh " Est. drop rate:\t%.0f%%\n", $droprate); # # Report text of each absent MOI # printf($rptfh " Missing MOIs:\n"); # # Report the text of all MOIs which were not present in this bootseq. # foreach (sort(keys(%mh))) { if ($mh{$_} == 0) { printf($rptfh "\t\t\t%s\n", substr($_,0,45)); } } printf($rptfh "\n\n"); printf($rptfh "\n"); if ($::RPT_PASSINF) { rpt(" Bootseq #%03d: %d/%d MOIs present, drop rate ~= %3.0f%%\n", $bsnum, $$rp->{moi_present}, $::moi_tot, $droprate); } } } # # Issue summary report. # sub session_rpt { use strict; printf($::rptfh "Summary:\n"); printf($::rptfh " Total bootseq events:\t\t%4d\n", $::nbs_tot); printf($::rptfh " Ignored (per cmdline): \t%4d\n", $::nbs_ign); printf($::rptfh " Ignored (nested): \t%4d\n", $::nbs_nst); printf($::rptfh " Ignored (too peculiar):\t%4d\n", $::nbs_pec); printf($::rptfh " Ignored (too short): \t%4d\n", $::nbs_sht); printf($::rptfh " Ignored (boot error): \t%4d\n", $::nbs_err); printf($::rptfh " Analyzed: \t%4d\n", $::nbs_ana); } # # Initialize main::* vars # sub initialize { # # Cmdline parm defaults. # $STRICT = 1; # Strictness level $RPT_NONMON = 0; # Don't report non-mono timestamps. $RPT_PASSINF = 0; # Don't report detailed per-pass info # # Misc constants # $TOO_SHORT = 100; # Reject bootseqs with fewer than this # lines # # Pseudo-file containing this system's max datagraph queue length # parameter, as reported by -s option. # $QLEN_FILE = '/proc/sys/net/unix/max_dgram_qlen'; # # Misc inits # @bsignlist = (); # List of bootseqs to ignore, from cmdline. @lnignlist = (); # List of line #'s to ignore, from cmdline. $nbs_tot = 0; # Total # of bootseq's found. $nbs_ana = 0; # # bootseqs accepted for analysis $nbs_sht = 0; # # bootseqs too short to analyze $nbs_pec = 0; # # bootseqs too peculiar to analyze $nbs_err = 0; # # bootseqs aborted due to boot error $nbs_nst = 0; # # bootseqs terminated due to nesting. $nbs_ign = 0; # # bootseqs ignored per cmdline # # Pull/chomp current date before setting INPUT_RECORD_SEPARATOR for DOS. # $rptdate = `date`; chomp $rptdate; # # Assume unix line endings for journalctl output processing. # $INPUT_RECORD_SEPARATOR = "\n"; # Var name assumes "use English" # # RE for extracting wallclock timestamps. Just the first 15 chars. # $re_tstmp = '(^.{15})'; # # RE used to identify message we interpret as the start of a bootseq. # $1 is the leading part of the message; $2 is the reported systemd version. # $re_bssta = '(systemd (\d+) running in.*)'; # # # RE used to idenfity the end of a boot sequence. # $re_bsend = '(Startup finished in.*)'; # # RE for "Starting..." messages issued by systemd. $1 is the msg text # $re_stng = '^.*systemd\[\d+\].*(Starting\s*.*$)'; # # RE for "Started..." messages issued by systemd. $1 is the msg text # $re_sted = '^.*systemd\[\d+\].*(Started\s*.*$)'; } # # Usage barf # sub usage { my $t; $t = "\n" . "usage:\n" . " jcdiag [-h] [-v] [-m] [-ib=bsignore] [-il=lnignore] logfile\n" . " -h This help text\n" . " -v Verbosely report info from each pass to stderr\n" . " -m Report non-monotonic timestamps.\n" . " -s Report miscellaneous system info of interest\n" . " -ib Ignore bootseq numbers specified in 'bsignore'.\n" . " Example: '-ib=3-7,10,19' [No spaces allowed]\n" . " -il Ignore logfile line numbers specified in 'lnignore'.\n" . " Example: '-il=30-70,100-192' [No spaces allowed]\n"; rpt($t); rpt("\n"); exit(-1); } # # Cmdline parse. # sub parse_cmdline { # # Assume -ib and -il options absent, hence associated range specs = null. # $IBSPEC = ''; $ILSPEC = ''; $cmdargs = join(' ', @ARGV); Getopt::Long::Configure('no_ignore_case'); # Option tags case sensitive Getopt::Long::Configure('require_order'); # Require opts before other args GetOptions ( "ib=s" => \$opt_ib, "il=s" => \$opt_il, "m" => \$opt_m, "v" => \$opt_v, "h" => \$opt_h, "s" => \$opt_s, ) or exit(-1); if (@ARGV != 1) { usage(); } if ($opt_h) { usage(); } if ($opt_v) { $RPT_PASSINF = 1; } if ($opt_m) { $RPT_NONMON = 1; } if ($opt_s) { $RPT_SYSINF = 1; } if ($opt_ib) { $IBSPEC = $opt_ib; my @ign = intspec_canon($IBSPEC, 10000); if ($ign[0] < 0) { die "? bsignore range spec error $ign[0]\n"; } @bsignlist = @ign; } if ($opt_il) { $ILSPEC = $opt_il; my @ign = intspec_canon($ILSPEC, 10000000); if ($ign[0] < 0) { die "? lnignore range spec error $ign[0]\n"; } @lnignlist = @ign; } # rpt("bsignlist: '@bsignlist'\n"); # rpt("lnignlist: '@lnignlist'\n"); # # Logfile name ($logfn) from cmdline. Report and dump output filenames # are the basename of $logfn concatenated with appropriate extension. # $logfn = $ARGV[0]; $rptfn = fileparse($logfn, qw{\.[^.]*}) . '.rpt'; # Report # # Open the report file so on return, mainline can start using it. # open($rptfh, ">$rptfn") or die "? unable to create $::rptfn\n"; # # If -s option, collect some info about the execution environment. # if ($RPT_SYSINF) { # See if we can actually read $QLEN_FILE. If not, warn and set the # value to null string so we know later. if (-r $QLEN_FILE) { $sys_dgram_qlen = `cat $QLEN_FILE`; chomp $sys_dgram_qlen; } else { $sys_dgram_qlen = ''; rpt("WARNING: Cannot read $QLEN_FILE for -s option\n"); } } } # # Format and return RCS version info string. # sub versinfo { use strict; my ($progname, $version, $locker); my $verstring; ($progname = '$RCSfile: jcdiag,v $') =~ s/.RCSfile: //; $progname =~ s/,v .$//; ($version = '$Revision: 1.39 $') =~ s/.Revision: //; $version =~ s/ .$//; ($locker = '$Locker: $') =~ s/.Locker: //; $locker =~ s/ .$//; $verstring = sprintf("$progname v$version"); if ($locker !~ /^\s*$/) { $verstring .= sprintf(" (locked by $locker)"); } return $verstring; } # # Slurp logfile into @::log # sub slurp_logfile { my $logfh; if (!open($logfh, $logfn)) { die "? unable to open $logfn\n"; } rpt("reading $logfn... "); @log = <$logfh> or die; rpt("%d lines\n", $#log+1); rpt("\n"); } # # Clean up after a run. # sub cleanup { # # Close report and data dump files. # close($::rptfh); } # # Validate a timestamp of the form Mmm dd hh:mm:ss. Zero return = OK, else # error. # sub validate_ts { use strict; my ($ts) = @_; my ($tslen); my ($tim); my ($mo, $da); my ($hr, $mi, $se); my %molist = ('Jan', 1, 'Feb', 2, 'Mar', 3, 'Apr', 4, 'May', 5, 'Jun', 6, 'Jul', 7, 'Aug', 8, 'Sep', 9, 'Oct', 10, 'Nov', 11, 'Dec', 12); # # Basic length check: Mmm dd hh:mm:ss should be length 15 always. # $tslen = length($ts); if ($tslen != 15) { return -1; } # # Split into month, day, and time. Verify ' ' delimiters present first, # so don't attempt split() on malformed string. # if (substr($ts,3,1) ne ' ' or substr($ts,6,1) ne ' ') { return -2; } ($mo, $da, $tim) = split(' ', $ts); # # Validate month abbreciation. # if (!defined($molist{$mo})) { return -20;} # # Day of month range check # if ($da < 1 or $da > 31) { return -23; } # # Time: Basic syntax check # if (!($tim =~ /([0-2][0-9]):([0-5][0-9]):([0-5][0-9])/)) { return -30; } # # Time: Subfield range checks # ($hr, $mi, $se) = ($1, $2, $3); if ($hr < 0 or $hr > 23) { return -31; } if ($mi < 0 or $mi > 59) { return -32; } if ($se < 0 or $se > 59) { return -33; } return 0; } # # Shorthand for printf(STDERR ...); # sub rpt { printf(STDERR @_); } # # Return canonicalized rangelist array based on supplied intspec, or negative # scalar if intspec is invalid. # sub intspec_canon($$) { my ($spec, $hlim) = @_; # Parms: Spec and hi limit my ($r0, $r1); # First, last elements in range my @ret = (); # returned array if ($hlim < 0) { return -4; } # # Split the specification into fields, classify each, set $r0 and $r1 # accordingly, and then append the ($r0, $r1) pair to @ret. Note that # $r0 and $r1 will be the same value for single-number fields. # foreach (split(/,/, $spec)) # Loop over each field in the spec { if (/^[0-9]+$/) # Simple number { $r0 = $r1 = $_; } elsif (/^[0-9]+\-[0-9]+/) # Explicitly limited range { ($r0, $r1) = split(/\-/); } elsif (/^[0-9]+\-$/) # Implicitly upper-limited range { ($r0) = split(/\-/); # $r0 is the supplied left element $r1 = $hlim; # $r1 is $hlim } elsif (/^$/) # Null field. Just ignore it. { next; } else # Bzzzzt. { return (-1); # No need to continue, it's hosed } if ($r1 < $r0) # Ordering error { return (-2); } if ($r1 > $hlim) # Hi limit past the end, bag it { return (-3); } @ret = (@ret, $r0, $r1); # Append selection to @ret } return @ret; } # # NAME # delsec - difference in seconds (incl. fractional seconds) between two dates # # SYNOPSIS # $seconds = delsec($datestring0, $datestring1); # # DESCRIPTION # Return the difference in seconds, including fractional seconds, between # the supplied dates, with $datestring0 assumed to be the earlier. # # Tested with input dates in format of $ds0 and $ds1 above, and a few # variations thereof. Assumes that fractional seconds, if supplied in # either date, are right-anchored in the date string, without trailing # whitespace. # sub delsec { use Date::Manip; my ($ds0, $ds1) = @_; # Args: Caller-supplied free-form dates my ($fs0, $fs1) = ('0.0', '0.0'); # Fractional seconds: Default to "0.0" my ($date0, $date1); my ($udate0, $udate1); my $nargs = 1 + $#_; if ($nargs != 2) { die "? Internal error: delsec() called with $nargs args\n"; } # # Parse off the fractional seconds from each date, if there are any. # These are used later as a "correction" to the delta-date computed by # the difference between the Unix-epoch dates, since that delta doesn't # include fractional seconds. # $fs0 = ($ds0 =~ m/(\.[0-9]*$)/) ? $1 : '0.0'; $fs1 = ($ds1 =~ m/(\.[0-9]*$)/) ? $1 : '0.0'; #print "ds0 = $ds0, ds1 = $ds1\n"; #print "fs0 = $fs0, fs1 = $fs1\n"; # # Convert each date to Date::Manip internal format. # $date0 = ParseDate($ds0); $date1 = ParseDate($ds1); #print "'$date0'\n"; #print "'$date1'\n"; # # Convert internal format dates to epoch seconds. # $udate0 = UnixDate($date0, "%s"); $udate1 = UnixDate($date1, "%s"); #print "'$udate0'\n"; #print "'$udate1'\n"; # # Compute delta-epoch-seconds, including the fractional part. Note the # implicit string-numeric -> numeric conversion of the fractional part. # I love perl. # return ($udate1 + $fs1) - ($udate0 + $fs0); } # # Return nonzero if $n is within [assumedly canonicalized] @rangelist, # zero otherwise. If @rangelist is empty, return zero. # sub inrange { my ($n, @rangelist) = @_; # # For each pair of values in @rangelist, check if $n is in the interval. # This also correctly returns zero if @rangelist is empty. # for (my $i = 0; $i < @rangelist; $i += 2) { if ($n >= $rangelist[$i] and $n <= $rangelist[$i+1]) { return 1; } } return 0; } # # vim:filetype=perl #