head 1.7; access; symbols pkgsrc-2013Q2:1.7.0.46 pkgsrc-2013Q2-base:1.7 pkgsrc-2012Q4:1.7.0.44 pkgsrc-2012Q4-base:1.7 pkgsrc-2011Q4:1.7.0.42 pkgsrc-2011Q4-base:1.7 pkgsrc-2011Q2:1.7.0.40 pkgsrc-2011Q2-base:1.7 pkgsrc-2009Q4:1.7.0.38 pkgsrc-2009Q4-base:1.7 pkgsrc-2008Q4:1.7.0.36 pkgsrc-2008Q4-base:1.7 pkgsrc-2008Q3:1.7.0.34 pkgsrc-2008Q3-base:1.7 cube-native-xorg:1.7.0.32 cube-native-xorg-base:1.7 pkgsrc-2008Q2:1.7.0.30 pkgsrc-2008Q2-base:1.7 pkgsrc-2008Q1:1.7.0.28 pkgsrc-2008Q1-base:1.7 pkgsrc-2007Q4:1.7.0.26 pkgsrc-2007Q4-base:1.7 pkgsrc-2007Q3:1.7.0.24 pkgsrc-2007Q3-base:1.7 pkgsrc-2007Q2:1.7.0.22 pkgsrc-2007Q2-base:1.7 pkgsrc-2007Q1:1.7.0.20 pkgsrc-2007Q1-base:1.7 pkgsrc-2006Q4:1.7.0.18 pkgsrc-2006Q4-base:1.7 pkgsrc-2006Q3:1.7.0.16 pkgsrc-2006Q3-base:1.7 pkgsrc-2006Q2:1.7.0.14 pkgsrc-2006Q2-base:1.7 pkgsrc-2006Q1:1.7.0.12 pkgsrc-2006Q1-base:1.7 pkgsrc-2005Q4:1.7.0.10 pkgsrc-2005Q4-base:1.7 pkgsrc-2005Q3:1.7.0.8 pkgsrc-2005Q3-base:1.7 pkgsrc-2005Q2:1.7.0.6 pkgsrc-2005Q2-base:1.7 pkgsrc-2005Q1:1.7.0.4 pkgsrc-2005Q1-base:1.7 pkgsrc-2004Q4:1.7.0.2 pkgsrc-2004Q4-base:1.7 pkgsrc-2004Q3:1.6.0.16 pkgsrc-2004Q3-base:1.6 pkgsrc-2004Q2:1.6.0.14 pkgsrc-2004Q2-base:1.6 pkgsrc-2004Q1:1.6.0.12 pkgsrc-2004Q1-base:1.6 pkgsrc-2003Q4:1.6.0.10 pkgsrc-2003Q4-base:1.6 netbsd-1-6-1:1.6.0.6 netbsd-1-6-1-base:1.6 netbsd-1-6:1.6.0.8 netbsd-1-6-RELEASE-base:1.6 pkgviews:1.6.0.4 pkgviews-base:1.6 buildlink2:1.6.0.2 buildlink2-base:1.6 netbsd-1-5-PATCH003:1.6 netbsd-1-5-PATCH001:1.3; locks; strict; comment @# @; 1.7 date 2004.11.16.19.44.46; author abs; state dead; branches; next 1.6; 1.6 date 2001.12.17.12.01.27; author abs; state Exp; branches; next 1.5; 1.5 date 2001.12.12.13.20.32; author abs; state Exp; branches; next 1.4; 1.4 date 2001.08.09.12.35.36; author abs; state Exp; branches; next 1.3; 1.3 date 2001.04.24.09.00.26; author abs; state Exp; branches; next 1.2; 1.2 date 2001.02.15.11.32.01; author abs; state dead; branches; next 1.1; 1.1 date 2000.12.28.13.33.11; author abs; state Exp; branches; next ; desc @@ 1.7 log @*** empty log message *** @ text @$NetBSD: patch-aa,v 1.6 2001/12/17 12:01:27 abs Exp $ --- checkbot.pl.orig Mon Dec 17 10:55:56 2001 +++ checkbot.pl @@@@ -49,6 +49,7 @@@@ [B<--match> match string] [B<--exclude> exclude string] [B<--proxy> proxy URL] [B<--internal-only>] [B<--ignore> ignore string] [B<--file> file name] + [B<--skip> skip string] [B<--match-url-base>] [B<--style> style file URL] [B<--mailto> email address] [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout] @@@@ -95,6 +96,11 @@@@ underneath it, but not the HTML pages in the subdirectories of the server, the I would be "www.someserver.xyz/($|[^/]+.html)". + +=item --match-url-base + +This option causes checkbot to use the site component of each url when +determining which pages are local. =item --exclude @@@@ -102,6 +108,12 @@@@ even if they happen to match the I (See option C<--match>). The I can be a perl regular expression. + +=item --skip + +URLs matching the I are not processed. + +The I can be a perl regular expression. =item --ignore @@@@ -276,7 +288,7 @@@@ # Get command-line arguments use Getopt::Long; - my $result = GetOptions(qw(debug help verbose url=s match=s exclude|x=s file=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=i timeout=i interval=i dontwarn=s enable-virtual language=s)); + my $result = GetOptions(qw(debug help verbose url=s match=s exclude|x=s file=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=i timeout=i interval=i dontwarn=s enable-virtual language=s match-url-base skip|x=s)); # Handle arguments, some are mandatory, some have defaults &print_help if (($main::opt_help && $main::opt_help) @@@@ -287,6 +299,7 @@@@ $main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval; $main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn; $main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual; + $main::opt_match_url_base = 0 unless defined $main::opt_match_url_base; # Set the default language and make sure it is a two letter, lowercase code $main::opt_language = 'en' unless defined $main::opt_language; $main::opt_language = lc(substr($main::opt_language, 0, 2)); @@@@ -385,7 +398,11 @@@@ my @@matchurls; my $matchurl; foreach $matchurl (@@starturls) { - push(@@matchurls, quotemeta $matchurl); + $_ = $matchurl; + if ($main::opt_match_url_base && m#^(\w+://[^/]+/)#) { + $_ = $1; + } + push(@@matchurls, quotemeta $_); } $main::opt_match = '(' . join('|', @@matchurls) . ')'; print STDERR "--match defaults to $main::opt_match\n" if $main::opt_verbose; @@@@ -737,7 +754,9 @@@@ print OUT "--urlStart URL(s)", join(',', @@starturls), "\n"; print OUT "--matchMatch regular expression$main::opt_match\n"; + print OUT "--match-url-baseMatch base of each url$main::opt_match_url_base\n" if defined $main::opt_match_url_base; print OUT "--excludeExclude regular expression$main::opt_exclude\n" if defined $main::opt_exclude; + print OUT "--skipSkip regular expression$main::opt_skip\n" if defined $main::opt_skip; print OUT "--ignoreIgnore regular expression$main::opt_ignore\n" if defined $main::opt_ignore; print OUT "--dontwarnDon't warn for these codes$main::opt_dontwarn\n" if $main::opt_dontwarn ne 'xxx'; print OUT "--enable-virtualUse virtual names onlyyes\n" if $main::opt_enable_virtual; @@@@ -879,7 +898,7 @@@@ add_to_queue($url, $response->base); $doc_new++; } - } else { + } elsif (!defined $main::opt_skip || $url !~ /$main::opt_skip/o) { # Add this as an external link if we can check the protocol later if ($url =~ /^(http|ftp|gopher):/o) { print EXTERNAL $url . "|" . $response->base . "\n"; @@@@ -1007,8 +1026,12 @@@@ sub add_to_queue { my ($url, $parent) = @@_; - print QUEUE $url . '|' . $parent . "\n"; - $main::st_int[$main::TODO]++; + if (defined $main::opt_skip && $url =~ /$main::opt_skip/o) { + print STDERR "Skip $url\n" if $main::opt_verbose; + } else { + print QUEUE $url . '|' . $parent . "\n"; + $main::st_int[$main::TODO]++; + } } sub print_server { @@@@ -1204,7 +1227,9 @@@@ print " --url url Start URL\n"; print " --match match Check pages only if URL matches `match'\n"; print " If no match is given, the start URL is used as a match\n"; + print " --match-url-base Use the site part of the url in --match\n"; print " --exclude exclude Exclude pages if the URL matches 'exclude'\n"; + print " --skip skip Do not process pages if the URL matches 'skip'\n"; print " --ignore ignore Do not list error messages for pages that the\n"; print " URL matches 'ignore'\n"; print " --file file Write results to file, default is checkbot.html\n"; @@@@ -1219,7 +1244,7 @@@@ print " --enable-virtual Use only virtual names, not IP numbers for servers\n"; print " --language Specify 2-letter language code for language negotiation\n"; print "\n"; - print "Options --match, --exclude, and --ignore can take a perl regular expression\nas their argument\n\n"; + print "Options --match, --exclude, --skip, and --ignore can take a perl regular\nexpression as their argument\n\n"; print "Use 'perldoc checkbot' for more verbose documentation.\n\n"; print "Checkbot WWW page : http://degraaff.org/checkbot/\n"; print "Mail bugs and problems: checkbot\@@degraaff.org\n"; @ 1.6 log @Update checkbot to 1.67. Changes: * A --language option to ask the server for pages in other languages * Bug fixes related to URI package and non-standard server names * Some other minor bugfixes detailed in the ChangeLog * Added example for use of the --match argument @ text @d1 1 a1 1 $NetBSD$ @ 1.5 log @Update checkbot to 1.66. Changes: * checkbot.pl (get_headers): URI doesn't know about netloc, but it does know about authority. (get_headers): $url is already absolute, no need for ->abs * checkbot.pl (handle_doc): Print a notice when external non HTTP/FTP URLs are dropped. * checkbot.pl (init_modules and other places): Remove URI::URL::strict call and use of new URI::URL because it is obsolete, we should use the URI classes now. * checkbot.pl (init_globals): Initialize last checkpoint time with 0 instead of current time, so that we write out a set of pages right at the start. This will catch problems with permissions for these pages as early as possible. * checkbot.pl (get_server_type): Take into account that we might not learn anything about the server * checkbot.pl (get_headers): Factored out of check_external so that moving to using GET requests only will be easier later. * checkbot.pl (send_mail): Really fix printing of starting URLs in email. All URLs are now printed in the subject and body of the message. @ text @d3 1 a3 1 --- checkbot.pl.orig Thu Oct 25 20:46:42 2001 d13 5 a17 4 @@@@ -90,6 +91,11 @@@@ The I can be a perl regular expression. d22 1 a22 1 + d25 2 a26 2 URLs matching the I are considered to be external, @@@@ -97,6 +103,12 @@@@ d29 1 a29 1 d35 1 a35 1 + d38 1 a38 2 If a URL has an error, and matches the I, its error @@@@ -262,7 +274,7 @@@@ d42 2 a43 2 - my $result = GetOptions(qw(debug help verbose url=s match=s exclude|x=s file=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=i timeout=i interval=i dontwarn=s enable-virtual)); + my $result = GetOptions(qw(debug help verbose url=s match=s exclude|x=s skip|x=s file=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=i timeout=i interval=i dontwarn=s enable-virtual match-url-base)); d47 1 a47 1 @@@@ -273,6 +285,7 @@@@ d52 4 a55 4 # The default for opt_match will be set later, because we might want # to muck with opt_url first. @@@@ -362,7 +375,11 @@@@ d68 1 a68 1 @@@@ -709,7 +726,9 @@@@ d78 1 a78 1 @@@@ -851,7 +870,7 @@@@ d85 1 a85 1 if ($url =~ /^(http|ftp):/o) { d87 1 a87 1 @@@@ -972,8 +991,12 @@@@ d102 2 a103 1 @@@@ -1167,6 +1190,7 @@@@ d106 1 d112 1 a112 3 @@@@ -1179,8 +1203,9 @@@@ print " --interval seconds Maximum time interval between updates (default 10800)\n"; print " --dontwarn codes Do not write warnings for these HTTP response codes\n"; d114 1 a114 1 + print " --match-url-base Use the site part of the url in --match\n"; @ 1.4 log @Updated checkbot to 1.64nb1 Implement a --skip option @ text @d3 1 a3 1 --- checkbot.pl.orig Sun Apr 15 20:34:30 2001 d5 1 a5 3 @@@@ -47,8 +47,9 @@@@ checkbot [B<--debug>] [B<--help>] [B<--verbose>] [B<--url> start URL] d7 3 a9 5 - [B<--proxy> proxy URL] [B<--internal-only>] - [B<--ignore> ignore string] [B<-file> file name] + [B<--skip> skip string] [B<--ignore> ignore string] + [B<--proxy> proxy URL] [B<--internal-only>] [B<--match-url-base>] + [B<--file> file name] d55 1 a55 1 @@@@ -353,7 +366,11 @@@@ d68 1 a68 1 @@@@ -781,7 +798,9 @@@@ d78 1 a78 1 @@@@ -923,7 +942,7 @@@@ d87 1 a87 1 @@@@ -947,8 +966,12 @@@@ d102 1 a102 1 @@@@ -1142,6 +1165,7 @@@@ d110 1 a110 1 @@@@ -1154,8 +1178,9 @@@@ @ 1.3 log @Update checkbot to 1.64 Changes in this version include: - Fix printing of starting URLs in email - Removed duplicate header in report - Try more environment variables to set temporary directory - Avoid using printf on pipes, and fix silly typo. Also add --match-url-base @ text @d5 1 a5 1 @@@@ -47,7 +47,7 @@@@ d10 2 d13 1 a13 1 [B<--ignore> ignore string] [B<-file> file name] d16 2 a17 1 @@@@ -90,6 +90,11 @@@@ d29 14 a42 1 @@@@ -262,7 +267,7 @@@@ d47 1 a47 1 + my $result = GetOptions(qw(debug help verbose url=s match=s exclude|x=s file=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=i timeout=i interval=i dontwarn=s enable-virtual match-url-base)); d51 1 a51 1 @@@@ -273,6 +278,7 @@@@ d59 1 a59 1 @@@@ -353,7 +359,11 @@@@ d72 1 a72 1 @@@@ -781,6 +791,7 @@@@ d78 1 d81 34 a114 1 @@@@ -1154,6 +1165,7 @@@@ d120 2 a121 1 print "Options --match, --exclude, and --ignore can take a perl regular expression\nas their argument\n\n"; d123 2 @ 1.2 log @Update to 1.63: Changes since 1.62: - Require LWP 5.50. A bug fix was introduced in LWP 5.49 which solves problems with relative Location: headers for Checkbot. - Create a defaults --match argument based on all the start URLs, not just the first one. - Remove .bak files when the new files are written correctly. - Create correct URLs when --file argument also contains directories. - Deal with redirects without a Location: header. - Don't exclude checkbot's own pages automagically. - Always ask server about file type for HTTP requests when uncertain. - Make output well-formed HTML. - Several typo's and other output fixes. patch-aa incorporated into main dist. @ text @d1 1 a1 1 $NetBSD: patch-aa,v 1.1 2000/12/28 13:33:11 abs Exp $ d3 1 a3 1 --- checkbot.pl.orig Sun Sep 24 14:51:46 2000 d5 8 a12 1 @@@@ -51,7 +51,7 @@@@ d14 3 a16 10 [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout] [B<--interval> seconds] [B<--dontwarn> HTTP responde codes] - [B<--enable-virtual>] + [B<--enable-virtual>] [B<--no-bak>] [start URLs] =head1 DESCRIPTION @@@@ -180,6 +180,10 @@@@ causes problems, which this feature work around by using the hostname to distinguish the server. d18 1 a18 1 +=item --no-bak d20 2 a21 1 +Do not save previous summary pages with .bak extensions. d23 1 a23 1 =back d25 2 a26 2 =head1 PREREQUISITES @@@@ -255,7 +259,7 @@@@ d31 1 a31 1 + my $result = GetOptions(qw(debug help verbose url=s match=s exclude|x=s file=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=i timeout=i interval=i dontwarn=s enable-virtual no-bak)); d35 1 a35 1 @@@@ -266,6 +270,7 @@@@ d39 1 a39 1 + $main::opt_no_bak = 0 unless defined $main::opt_no_bak; d43 19 a61 10 @@@@ -698,7 +703,7 @@@@ print STDERR "*** Start writing results page\n" if $main::opt_verbose; open(OUT, ">$main::file.new") - || die "$0: Unable to open $main::file.bak for writing:\n"; + || die "$0: Unable to open $main::file.new for writing:\n"; print OUT "\n"; if (!$final_page) { printf OUT "\n", @@@@ -762,6 +767,7 @@@@ d64 1 a64 17 print OUT "--enable-virtualUse virtual names onlyyes\n" if $main::opt_enable_virtual; + print OUT "--no-bakDo not save previous summaries in .bak filesyes\n" if $main::opt_no_bak; print OUT "--internal-onlyCheck only internal linksyes\n" if defined $main::opt_internal_only; print OUT "\n"; @@@@ -772,7 +778,9 @@@@ close(OUT); - rename($main::file, $main::file . ".bak"); + unless ($main::opt_no_bak) { + rename($main::file, $main::file . ".bak"); + } rename($main::file . ".new", $main::file); print STDERR "*** Done writing result page\n" if $main::opt_verbose; @@@@ -1121,6 +1129,7 @@@@ d68 1 a68 1 + print " --no-bak Do not save previous summaries in .bak files\n"; @ 1.1 log @Update to 1.62nb1, fix type in error message and add --no-bak option - patch submitted back to maintainer. @ text @d1 1 a1 1 $NetBSD$ @