Make XML sitemap off-line with perl

Disclaimer:
The use of this code is at your own risk.

Description:
Create an XML sitemap for google.
Google does use xml sitemaps to index websites.
It just is a list of url's of your website in format.
See https://www.google.com/sitemaps/protocol.html
This version tries to add the <lastmod></lastmod> tag scanning the url on the Internet.

How to use:
1: Be sure perl language is installed on your W32 system.
http://strawberryperl.com/
2: Put this script in the root directory of your site.
https://comweb.nl/perl/makeSitemapXML/makeSitemapXML.zip
Need to unzip it contains this instruction and the makeSitemapXML.pl file
Not verget to edit line 16
3: open the comand prompt and cd to this directory.
4: execute perl makeSitemapXML.pl > mySitemapName.xml
5: upload the mySitemapName.xml to your Internet site.
6: Register the mySitemapName.xml at google for indexing your site.

Purpose:
I wrote this because I could not fine any usefull tools for easily creating xml sitemaps
My site does contain more than 800 html links
It is just a quick fix
Later added the w3ctime subroutine that tries to <lastmod></lastmod> tag
\Performance:
It seems to work perfectly.
But is a beta
Disadvantage: be carefull to not upload more than needed from your hard drive.
It indexes curent and all directories above
This version tries to add the last modified tag.
If it can not add the last modified tag it adds a warning comment to the xml sitemap file.

Donation:
If you want to support my work, thanks

Source Code description:
Will add this later
See the instructions in the source file
   
1: #!/usr/bin/perl
2:  
3:    use strict;
4:    use warnings;
5:    use Cwd 'abs_path';
6:    use POSIX;
7:    use LWP::UserAgent;
8:  
9:    # (c)2019 Comweb NL All RIGHTS RESERVED
10:    # RJHMM van den Bergh sales2@comweb.nl
11:    # use at own risk, no liability
12: 
13:   # usage cd to/Your/HTML/root/direcory/on/your/win32/system
14:   # perl makeXML.pl > myXMLmap.xml
15:   
16:   # EDIT THIS ONE IF NEEDED
17:   my $prefix="https://www.comweb.nl";
18: 
19:   print <<HEADER;
20:   
21:   <urlset
22:       xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
23:       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
24:       xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 
25:          http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
26:            
27:   <!-- Created by Perl script provided by Comweb Netherlands -->
28:   <!-- (c)Comweb.nl RJHM van den Bergh -->
29:   <!-- Free for non commercial use -->
30:             
31: HEADER
32:   
33:   my $startlocationDirectory=abs_path(".");
34:   my $level=0; # directory level  
35:  
36:   getdir($startlocationDirectory); 
37:   
38:   sub getdir
39:   {
40:       my $dir = shift;
41:       my $path="";
42:       opendir (my $dh, $dir) || die "Cannot opend $dir $!";
43:  
44:       while( my $file = readdir($dh)) 
45:       { 
46:           next if ($file =~ m[^\.{1,2}$]);   # Ignore . and ..
47:           if (-d $file)  # is directory?
48:           {
49:               # directory found add a comment in the sitemap for readability
50:               print "<!-- DIR $prefix/$file -->\n";
51:               
52:               # go one directory level up
53:               $path = $dir .'/'. $file;  # no $file here is a directory name
54:               chdir($path); $level++;    # change working directory 
55:               $path=getcwd();            # get working directory
56:               $path =~ s/\\/\//g ;       # change \ (w32) to / (Internet url)
57: 
58:               getdir($path);             # recursive call of this function
59:             
60:               # getting out of the directory 
61:               # so go level down
62:               chdir("..");  $level--;    # change working directory
63:               $path=getcwd();            # get working directory  
64:               $path =~ s/\\/\//g ;       # change \ (w32) to / (Internet url)   
65:         
66:           } elsif (-f $file)   #no directory perhaps is a file?
67:           {
68:               # it is not a directory but a file
69:               # only look for certain types of extensions           
70:               if ($file =~ m\(htmL|htm|txt|shtml|zip|java|php|jpg|png|jpeg|gif|avi|pdf|mpg|mpeg)$\i)  
71:               {
72:                   # print the <url>   (mandantory)
73:                   print "<url>\n";
74: 
75:                   # calculate Internet location url
76:                   my $filepath=getcwd();     # get working directory
77:                   $filepath =~ s/\\/\//g ;   # change \ (w32) to / (Internet url)
78:                   # need to replace first part of path\.... with htt//...... ($prefix)
79:                   my $location = $filepath;
80:                   $location =~ s/$startlocationDirectory/$prefix/g;
81:                   $location .="/$file";      # not forget to append the filename
82: 
83:                   # some characters are not allowed in urls like & < > and whitespace
84:                   my $replace="&";
85:                   $location =~ s/\&/$replace/g ;  #subtitute & with & 
86:                   $replace="%20";
87:                   $location =~ s/ /$replace/g ;       #substute whitespace with %20        
88: 
89:                   # print the <loc>htt//....myurl</loc>   (mandantory)
90:                   print "  <loc>$location</loc>\n";
91:                   
92:                   printLastmod($location);
93: 
94:                   # calculate a priority 
95:                   my $priority=ceil((1*0.9**$level)*100)/100;
96:                   # print the <priority>x.x</priority> tag 
97:                   print "  <priority>$priority</priority>\n";
98:                 
99:                   # close the <url> tag 
100:                   print "</url>\n";
101:               } 
102:          }        
103:      }
104:      closedir ($dh) || die "Failed to close directo $!";
105:  }
106: 
107:  # print footer with closing </urlset>
108:  print <<FOOTER;
109:  
110:  </urlset>
111:  
112:  <!-- Created by Perl script provided by Comweb Netherlands -->
113:  <!-- (c)2019 Comweb.nl RJHM van den Bergh -->
114:  <!-- Free for non commercial use -->
115:  <!-- use at own risk, beta version -->
116:           
117: FOOTER
118: 
119:  sub printLastmod
120:  {
121:     my $location = shift;
122:     my $agent = new LWP::UserAgent;
123:     my $response = $agent->head($location); # sometimes hangs for 2 minutes
124:     #https://stackoverflow.com/questions/12391671/perl-lwpuseragent-hangs-for-120-seconds-seemingly-randomly-against-a-given-ser
125:     my $lastmod=($response->last_modified);
126:     $lastmod //= 0; # if $lastmod set to undef (in line above) then set it to 0   
127:     if ( $lastmod!=0) # if 0 probably could not get $lastmod  (for example 404 page)
128:     {
129:       my $w3ctime=&w3ctime($lastmod);
130:       print  "  <lastmod>$w3ctime</lastmod>\n";
131:     } else
132:     {
133:       print  "  <!-- WARNING could not get last modified from Internet 404? -->\n"; 
134:     }
135:  }
136:  
137: sub w3ctime()
138: {
139:     my $time_passed_since_epoch=shift;  #  time in seconds since 1 jan 1970 00:00:00
140:     # initialize some variables
141:     my $year=1970;
142:     my $isLeapYear=0;
143:     my $leapYearInSeconds=366*24*60*60;
144:     my $yearInSeconds    =365*24*60*60;
145:     my $start_counter=0;
146:     
147:     
148:     while( $start_counter<=$time_passed_since_epoch)
149:     {
150:       #determine if it is a leap year      
151:       if (($year % 4)==0   ) { $isLeapYear=1; } else { $isLeapYear=0; }
152:       if (($year % 100)==0 ) { $isLeapYear=0; } # exception
153:       if (($year % 400)==0 ) { $isLeapYear=1; } # exception      
154:       
155:       # add seconds of that year to the start counter
156:       if( $isLeapYear ) 
157:       {
158:          $start_counter+=$leapYearInSeconds; 
159:       } else
160:       {
161:          $start_counter+=$yearInSeconds;
162:       }      
163:       $year++;      
164:     }
165:     # $tart passed $time_passed
166:     $year--;
167:     if( $isLeapYear ) 
168:     {
169:       $start_counter-=$leapYearInSeconds; 
170:     } else
171:     {
172:       $start_counter-=$yearInSeconds;
173:     }
174:     
175:     my $month=0;
176:     # can add jan 31 days?
177:     if ( ($start_counter+31*24*60*60)<=$time_passed_since_epoch )
178:     {
179:        $start_counter+=31*24*60*60;  
180:        $month++;
181: 
182:        # can add februari?
183:        my $februariDays=28;
184:        if ($isLeapYear) { $februariDays=29;}
185:        if ( ($start_counter+$februariDays*24*60*60)<=$time_passed_since_epoch )
186:        {
187:           $start_counter+=$februariDays*24*60*60;  
188:           $month++;
189:        
190:           # can add march?
191:           if ( ($start_counter+31*24*60*60)<=$time_passed_since_epoch )
192:           {
193:              $start_counter+=31*24*60*60;  
194:              $month++;
195:        
196:              # can add april?
197:              if ( ($start_counter+30*24*60*60)<=$time_passed_since_epoch )
198:              {
199:                 $start_counter+=30*24*60*60;  
200:                 $month++;
201: 
202:                 # can add may?
203:                 if ( ($start_counter+31*24*60*60)<=$time_passed_since_epoch )
204:                 {
205:                    $start_counter+=31*24*60*60;  
206:                    $month++;
207:                 
208:                    # can add june?
209:                    if ( ($start_counter+30*24*60*60)<=$time_passed_since_epoch )
210:                    {
211:                       $start_counter+=30*24*60*60;  
212:                       $month++;
213: 
214:                       # can add july?
215:                       if ( ($start_counter+31*24*60*60)<=$time_passed_since_epoch )
216:                       {
217:                          $start_counter+=31*24*60*60;  
218:                          $month++;
219: 
220:                          # can add august?
221:                          if ( ($start_counter+31*24*60*60)<=$time_passed_since_epoch )
222:                          {
223:                             $start_counter+=31*24*60*60;  
224:                             $month++;
225: 
226:                             # can add september?
227:                             if ( ($start_counter+30*24*60*60)<=$time_passed_since_epoch )
228:                             {
229:                                $start_counter+=30*24*60*60;  
230:                                $month++;
231: 
232:                                # can add oktober?
233:                                if ( ($start_counter+31*24*60*60)<=$time_passed_since_epoch )
234:                                {
235:                                   $start_counter+=31*24*60*60;  
236:                                   $month++;
237:                                
238:                                   # can add november?
239:                                   if ( ($start_counter+30*24*60*60)<=$time_passed_since_epoch )
240:                                   {
241:                                      $start_counter+=30*24*60*60;  
242:                                      $month++;
243: 
244:                                      # can add december?
245:                                      if ( ($start_counter+31*24*60*60)<=$time_passed_since_epoch )
246:                                      {
247:                                         $start_counter+=31*24*60*60;  
248:                                         $month++;
249: 
250:                                         # if it comes here then something not correct
251:                                         # should not be possible to add full decembre month
252:                                         # because then it is next year
253:                                         die "Oops it shouldn't get here";
254:                                                                                                                      
255:                                      }
256:                                   }
257:                                }
258:                             }
259:                          }
260:                       }
261:                    }
262:                 }
263:              }
264:           }
265:        }
266:     }
267:     
268:     # calculate days in the month lowest day = 0
269:     my $days=floor(($time_passed_since_epoch-$start_counter)/(24*60*60));
270:     $start_counter+=($days*24*60*60);
271:     if (($days+1)>31) { die "Oops days can't be bigger than 31.";}
272:     
273:     #calculate hours lowest = 0
274:     my $hours=floor(($time_passed_since_epoch-$start_counter)/(60*60));
275:     $start_counter+=($hours*60*60);
276:     if ($hours>=24) { die "Oops hours must be less than 24.";}
277:     
278:     # calculate minutes lowest = 0
279:     my $minutes=floor(($time_passed_since_epoch-$start_counter)/(60));
280:     $start_counter+=($minutes*60);
281:     if ($minutes>=60) { die "Oops minutes must be less than 60.";}
282: 
283:     my $seconds=floor($time_passed_since_epoch-$start_counter);
284:     if ($seconds>=60) { die "Oops seconds must be less than 60.";}
285:     
286:     # putting everything in the right format
287:     $year  = sprintf("%04d", $year); 
288:     $month = sprintf("%02d", $month+1);
289:     $days  = sprintf("%02d", $days+1); 
290:     $hours   = sprintf("%02d", $hours);  
291:     $minutes = sprintf("%02d", $minutes); 
292:     $seconds = sprintf("%02d", $seconds);
293:     
294:     my $w3cTimeString="$year-".($month)."-".($days)."T".$hours.":".$minutes.":".$seconds."Z";
295:     return $w3cTimeString; 
296: }