User:Neoconned/LocalRefConverter
Jump to navigation
Jump to search
This is a crudely hacked version of http://en.wikipedia.org/wiki/User:Cyde/Ref_converter. It converts an article from the note/ref template referencing system to the newer Cite.php references. Cyde Weys' original version is designed to run on a web server and to fetch articles directly from Wikipedia. This version runs on your PC, reads the article to convert from a local text file, and saves the converted article to another local text file. To use:
- Install Perl on your computer.
- Unlike with the original, you don't need to install any Perl extensions from CPAN.
- Save the code below into a file called wikirefs.txt in the directory you will run Perl from (your working directory).
- Put the article wiki source you want to convert into a file called convert_me.txt in that directory.
- Run perl wikirefs.txt
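- The converted article is written to a file called convertedFile.txt in that directory. (If the article contains no {{ref}}/{{mn}}-style templates, there is nothing to convert and no output file is created.)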
#!/usr/bin/perl
#
# "WikiRefs"
# This program converts {{note}} and {{ref}} to <references /> style on Wikipedia.
# Copyright (C) 2006 Ben "Cyde Weys" McIlwain
# Trivially modified by Neoconned (SourceWatch) to run locally, May 2007
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Usage: put the article's wiki source in convert_me.txt in the current
# directory and run this script; the converted article is written to
# convertedFile.txt. Nothing is written if the article contains no
# {{ref}}-style templates.
#

use strict;
use warnings;

#---------------------------------------------------
# Script configuration options.
#
#Set to 1 to enable logging. Set to 0 to disable logging.
#(Currently unused: writeToLog below is an empty stub.)
my $optLogging = 1;
#
#Specify the location to store the log (must be writable by your httpd process).
#(Currently unused, see above.)
my $optLogLoc = '/var/log/apache2/refconv.log';
#
#Input and output files, relative to the current working directory.
my $data_file = 'convert_me.txt';
my $outfile   = 'convertedFile.txt';
#
#Set to "on" to wrap the generated <references /> in a references-small div.
#The script never assigned this, so it defaults to off.
my $smallFont = 'off';
#
#Set to "on" to emit <ref name="..."> even for refs that are used only once.
#The script never assigned this, so it defaults to off.
my $forceNames = 'off';
#
#Set to 1 to also print the detailed per-reference log at the end.
my $optVerbose = 0;
#
#---------------------------------------------------

sub writeToLog;

#This accumulates the number of possible things that were incorrect with {{ref}}/{{note}}.
my $numErrors = 0;

print 'Getting Wiki source...';

#Slurp the whole article into memory in one read.
my $fullText = do {
    local $/;
    open(my $in, '<', $data_file)
        or die("Could not open file $data_file: $!");
    <$in>;
};
$fullText = '' unless defined $fullText;

print "\n\n";
print "OPENED FILE OK \n\n";

###############################################################################

#These two variables accumulate lines of text and are output at the end.
my $warnings  = "";
my $verbosage = "";

#Get rid of the "How to add a footnote" comment that this script makes superfluous.
if ($fullText =~ s/\<\!\-\-[^\n]*add[^\n]*footnote.*?\-\-\>//s) {
    $warnings .= "Deleting comment on how to add old footnotes, make sure this was done correctly.\n";
}

#{{mnb2}} is incredibly broken
#(No /g on boolean tests: in scalar context it only leaves pos() set on the string.)
if ($fullText =~ m/\{\{mnb2/i) {
    $warnings .= "Panic, detecting {{mnb2}}, this article is most likely broken and will need manual repair.\n";
}

#This goes through the article source looking for citation templates that are over one line. This is
#necessary because the citation templates must be inserted into the article text inline or things will break.
#This has the side-effect of changing citation templates that aren't part of notes. Oh well.
#Then we need to detect if any changes have been made, and if they have, print a warning message to that effect.
my $tempText = $fullText;
$fullText =~ s/(\{\{cite [^\{\}]*?\}\})/my $x = $1; $x =~ s{\n}{}g; $x/egs;
if ($tempText ne $fullText) {
    $warnings .= "Detecting multiple line cite, trying to fix, make sure I don't make any mistakes.\n";
}

#Get a list of all matches of {{ref|...}} and {{ref label|...}} and {{ref harv|...}} and {{ref harvard|...}}
my @matches = ($fullText =~ m/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*([^\|]*?)\s*(?:\|\s*[^\|\}]*?\s*)*?\}\}/gi);

#If there are no {{ref}}s in the article then there's no point in continuing.
if (@matches) {

    ### This next little section creates @matchesSingle, which consists of @matches minus
    # any duplicate entries, and @matchesMult, which consists of a list of single entries
    # of things that did have duplicate entries. It also removes duplicate entries from @matches.
    # In other words, if @matches was [a,a,b,c,d,d,e], then:
    #   @matches       = [a,b,c,d,e]
    #   @matchesSingle = [b,c,e]
    #   @matchesMult   = [a,d]
    # Ref names are compared case-insensitively, so everything is lowercased here.
    my %tempHash;
    my %multHash;
    foreach my $name (@matches) {
        my $key = lc($name);
        if (exists $tempHash{$key}) {
            $multHash{$key} = $key;
        }
        else {
            $tempHash{$key} = $key;
        }
    }
    @matches = sort values %tempHash;
    my @matchesMult = sort values %multHash;

    #Subtract set @matchesMult from set @matchesSingle
    foreach my $name (@matchesMult) {
        delete $tempHash{$name};
    }
    my @matchesSingle = sort values %tempHash;
    #
    # End complicated section.
    ###

    if (@matchesMult) {
        $warnings .= "Detecting multiple refs with the same name, make sure I handle this correctly.\n";
    }

    #refCorrs is the hash between ref name and note text.
    my %refCorrs   = ();
    my $finalText  = "";
    my $firstMatch = 1;

    #Split the full Wiki source into discrete lines and process them sequentially to see if
    #each line contains a {{note}} or a {{note label}}. If the line does contain a {{note}},
    #match it up in the hash with its appropriate ref. If it doesn't match, throw a warning
    #and comment it out. If it did match, remove it, and replace all removed {{note}}s with a single <references />
    foreach my $thisLine (split /\n/, $fullText) {
        my $matched = 0;

        #Loop through each of the ref names to see if it matches with any notes on this line. This has O(n*m) efficiency.
        foreach my $name (@matches) {
            if ($thisLine =~ m/\{\{(?:mnb2?|note(?:[_ ]label)?)\s*\|\s*\Q$name\E\s*(?:\|\s*[^\{\}]*?\s*)*\}\}\s*(.*)$/i) {
                my $thisMatch = $1;

                #Strip any further {{note label}} templates out of the note text.
                $thisMatch =~ s/\{\{note[_ ]label\s*[^\}\{]*?\}\}//gi;

                #Chop off leading and trailing spaces.
                $thisMatch =~ s/^\s+//;
                $thisMatch =~ s/\s+$//;

                $verbosage .= "Matching up ref \"$name\", removing from list, note is: $thisMatch\n";
                $refCorrs{$name} = $thisMatch;
                $matched = 1;

                #firstMatch is used to keep track of the first note that has been replaced. The first note is replaced
                #with <references /> and the rest are just deleted.
                if ($firstMatch == 1) {
                    if ($fullText !~ m/\<references(\s*\/)?\>/) {
                        if ($smallFont eq "on") {
                            $finalText .= '<div class="references-small"><references /></div>' . "\n";
                        }
                        else {
                            $finalText .= "<references />\n";
                        }
                    }
                    $firstMatch = 0;
                }
            }
        }

        #If this line had a note with no corresponding ref, comment it out and print a warning message.
        if ($matched == 0) {
            if ($thisLine =~ m/\{\{(?:mnb2?|note)\s*\|\s*([^\|]*?)\s*\|?\s*\}\}\s*(.*)$/i) {
                $warnings .= "Note \"$1\" isn\'t referenced, commenting out, link was: $2\n";
                $numErrors++;
                $finalText .= "<!-- Dead note \"$1\": $2 -->\n";
            }
            else {
                $finalText = $finalText . $thisLine . "\n";
            }
        }
    }

    #Go through and replace references that were only referenced once with a simple <ref>.
    foreach my $currMatch (@matchesSingle) {
        if (exists $refCorrs{$currMatch} && $refCorrs{$currMatch} !~ m/^\s*$/) {
            if ($forceNames eq 'on') {
                #Cite.php returns an error if the refName is an integer value, so we'll pad it out with a character.
                my $refName = $currMatch;
                if ($refName =~ m/^\d+$/) {
                    $refName = 'ref' . $refName;
                }
                $finalText =~ s/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*\Q$currMatch\E\s*(?:\|[^\|\}]*?\s*)*?\}\}/\<ref name=\"$refName\"\>$refCorrs{$currMatch}\<\/ref\>/gi;
            }
            else {
                $finalText =~ s/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*\Q$currMatch\E\s*(?:\|[^\|\}]*?\s*)*?\}\}/\<ref\>$refCorrs{$currMatch}\<\/ref\>/gi;
            }
            $verbosage .= "Replacing ref \"$currMatch\" with full note: \<ref\>$refCorrs{$currMatch}\<\/ref\>\n";
        }
        elsif (exists $refCorrs{$currMatch}) {
            #Deal with blank notes. We don't want to be inserting <ref></ref> into the article.
            #The note exists but the first branch rejected it, so its text is blank. (The
            #old code tested the ref name here instead of the note text, so blank notes were
            #misreported as missing.)
            $numErrors++;
            $warnings .= "Found a blank note, ref is \"$currMatch\"\n";
        }
        else {
            $numErrors++;
            $warnings .= "Ref \"$currMatch\" doesn\'t exist in notes. Turning into \{\{citation needed\}\}\n";
        }
    }

    #Now we need to go through and replace references that were referenced multiple times.
    #We need to name our references now.
    foreach my $currMatch (@matchesMult) {
        if (exists $refCorrs{$currMatch} && $refCorrs{$currMatch} !~ m/^\s*$/) {
            #Cite.php returns an error if the refName is an integer value, so we'll pad it out with a character.
            my $refName = $currMatch;
            if ($refName =~ m/^\d+$/) {
                $refName = 'ref' . $refName;
            }

            #The first occurrence carries the full note text (no /g), every later one is a bare named back-reference.
            $finalText =~ s/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*\Q$currMatch\E\s*(?:\|[^\|\}]*?\s*)*?\}\}/\<ref name=\"$refName\"\>$refCorrs{$currMatch}\<\/ref\>/i;
            $finalText =~ s/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*\Q$currMatch\E\s*(?:\|[^\|\}]*?\s*)*?\}\}/\<ref name=\"$refName\" \/\>/gi;
            $verbosage .= "Replacing multiply referenced \"$refName\" with full notes: \<ref\>$refCorrs{$currMatch}\<\/ref\>\n";
        }
        elsif (exists $refCorrs{$currMatch}) {
            #Deal with blank notes. We don't want to be inserting <ref></ref> into the article.
            $numErrors++;
            $warnings .= "Found a blank multiply referenced note, ref is \"$currMatch\"\n";
        }
        else {
            $numErrors++;
            $warnings .= "Multiple reference \"$currMatch\" doesn\'t exist in notes. Turning into \{\{citation needed\}\}\n";
        }
    }

    #One more loop through any remaining {{ref}} tags to turn them into {{citation needed}}.
    $finalText =~ s/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*[^\|]*?\s*(?:\|[^\|\}]*?\s*)*?\}\}/\{\{citation needed\}\}/gi;

    #Remove excess newlines that we may have just made by deleting the content inbetween.
    if ($finalText =~ s/\n{4,}/\n\n/g) {
        $warnings .= "I think I have found too many consecutive newlines, I am going to remove them, make sure I did this right.\n";
    }

    #Final sanity checks
    if ($finalText =~ m/\{\{ref/i) {
        $warnings .= "Failing sanity check, there may still be some {{ref}}s left.\n";
    }
    if ($finalText =~ m/\{\{note/i) {
        $warnings .= "Failing sanity check, there may still be some {{note}}s left.\n";
    }
    if ($finalText =~ m/\{\{mn/i) {
        $warnings .= "Failing sanity check, there may still be some Footnote4 stuff left ({{mn}} or {{mnb}}).\n";
    }

    print '<b>Finished</b>.<br>' . "\n";

    #Double quotes so the \n escapes are real newlines (they were printed literally before).
    print "\n\n\n";
    print "WRITING OUTPUT FILE \n\n";

    open(my $out, '>', $outfile) or die "Can't open $outfile : $!";
    print {$out} $finalText or die "Can't write to $outfile : $!";
    #Buffered write errors only surface at close, so check it.
    close($out) or die "Can't close $outfile : $!";
}
else {
    print "No {{ref}} templates found, nothing to convert; no output file written.\n";
}

#Output the accumulated messages; they ask the user to double-check specific changes.
if ($warnings ne "") {
    print "WARNINGS ($numErrors possible errors):\n" . $warnings . "\n";
}
if ($optVerbose && $verbosage ne "") {
    print "DETAILS:\n" . $verbosage . "\n";
}

#Writes log output to a file.
#(Stub: the local version does no logging.)
sub writeToLog {
}