Benutzer:Svebert/HA Bash

Dieses Skript ist die erste Version eines Bash-Skriptes, um Hauptautoren zu ermitteln. Bislang ist es sehr rudimentär, aber funktionstüchtig.

Hier kommt der Code des Skriptes. Diesen Code einfach in eine leere Datei (z. B. mit dem Namen author.sh) kopieren. Bedienungsanweisungen finden sich als Kommentar im Skript.

Der Vorteil dieses Skripts gegenüber anderen (s. Hauptautoren) ist, dass nichts extra installiert oder eingestellt werden muss. Dieses Skript ist auf einem normalen Linuxrechner lauffähig.

Achtung: Das Skript ist vom 17.06.2012, also sehr neu, und hat daher sicherlich viele Macken und Fehler. Ich werde weiter daran arbeiten. Im Endeffekt soll das Skript eine grafische Wortwolke erzeugen. Aber bis dahin ist es noch ein weiter Weg. Bitte meldet mir etwaige Fehler. Mir bekannte Fehler habe ich als Kommentar im Skript aufgeführt und werde diese bald beheben. Es ist ausdrücklich erwünscht, den hier gezeigten Code weiterzuentwickeln.

Viel Spaß

#!/bin/bash
# ----------------------------------------------------------------------------
# Counts the edits and the edited bytes of every user for one Wikipedia article
# Script by Svebert (06/2012) - Version 0.6
# ----------------------------------------------------------------------------
#
# ---- Configuration ---------------------------------------------------------
# Lemma (title) of the article that should be inspected
LEMMA="Trägheitskraft"
# Columns of the result table used as sort keys:
#   2 = EDIT_COUNT, 3 = ABS BYTES EDITED, 4 = BYTES EDITED, 5 = SCORE
SORT_BY_COLUMN_PRIMARY="5"
SORT_BY_COLUMN_SECONDARY="2"
# Aggregate IPs: y or n. With y all IPs are handled as one single user named @
# The script is much faster with y
AGG_IP='y'
#
# ---- Usage -----------------------------------------------------------------
# Run the script in a linux terminal:
#   source author.sh
# The result is printed to the terminal; redirect it into a file if you like:
#   source author.sh > result.txt
# While running, the version history of the lemma is downloaded and stored in
# temporary xml-files
#
# ---- Idea of this script ---------------------------------------------------
# All other main author programs listed at WP:Hauptautor need something to be
# installed first. This script tries to run without any extra programs on a
# "normal" linux machine:
#   1. the version history is downloaded with wget through the Wiki-API (xml)
#   2. the xml-files are parsed with xpath, edits and bytes are counted
#   3. the result is printed to stdout
#
# ---- Known bugs and ToDos --------------------------------------------------
# * xpath prints 'Value: Query didn't return a nodeset.' to stderr for no(?) reason
# * the script is still slow, especially for articles with a long version history
# * exclude IPs and/or bots
#
# ---- Nothing needs to be edited below this line -----------------------------

#Delete the result arrays in case a previous run of the script was quit abnormally
unset USER_NAMES
unset CUM_BYTES
unset ABS_CUM_BYTES
unset EDIT_COUNT

REV_LIMIT=75 # revisions requested per API call; rev limit maximum is 500
TMP_FILE='tmp.xml' # name prefix of the temporary files, the file number is appended
LEMMA=${LEMMA//[[:space:]]/_} #replace whitespace in LEMMA with underline, otherwise the URL doesnt work
#rawcontinue=1 asks the API for the raw <query-continue> continuation data that
#the download loop below looks for
WIKI_URL="http://de.wikipedia.org/w/api.php?action=query&prop=revisions&format=xml&rvprop=timestamp|user|size|comment&rvlimit=$REV_LIMIT&rvdir=older&rawcontinue=1&titles=$LEMMA"

#Download Version history
echo "Starting download..." >&2
AGAIN=1
CONTINUE_TAG=""
RVSTARTID=""
file_no=0
USER_STR=""
USER_STR_T=""
#download until all revisions are fetched; every request is stored in its own
#file $TMP_FILE<n>, file_no counts the files that were written successfully
while [[ "$AGAIN" == 1 ]]
do
	#both arguments are quoted so that the URL (it contains the glob character '?')
	#is never subject to word splitting or pathname expansion
	if ! wget -O "$TMP_FILE$file_no" "$WIKI_URL$CONTINUE_TAG"
	then
		#do not count the broken file; it is removed right away because the
		#cleanup at the end of the script only knows about counted files
		echo "Download of file $file_no failed, the result will be incomplete." >&2
		rm -f -- "$TMP_FILE$file_no"
		AGAIN=0
		break
	fi

	#check if all versions were downloaded: xpath prints the continuation
	#attribute as rvcontinue="..."; quotes and whitespace are stripped so that the
	#remainder can be appended to the URL as an additional parameter
	RVSTARTID=$(xpath -q -e "/api/query-continue/revisions/@rvcontinue" "$TMP_FILE$file_no" | tr -d '"[:space:]')
	if [[ -z "$RVSTARTID" ]]
	then
		AGAIN=0
		CONTINUE_TAG=""
	else
		CONTINUE_TAG="&$RVSTARTID"
		AGAIN=1
	fi
	((file_no++))
done
echo "Finished download ($file_no files)." >&2


#declare new arrays and counters used while the downloaded files are evaluated
declare -a USER_NAMES
declare -a CUM_BYTES
declare -a ABS_CUM_BYTES
declare -a EDIT_COUNT
user_counter=0
edit_counter=0

#######################################
# Absolute value of an integer.
# Globals:   value (written) - receives the result
# Arguments: $1 - integer; an empty or missing argument counts as 0, so that
#                 "value" never keeps a stale result from an earlier call
#######################################
abs ()
{
  local n=${1:-0}
  if (( n < 0 ))
  then
    value=$(( -n ))   # negative: change sign
  else
    value=$(( n ))    # zero or positive: leave it alone
  fi
}
#######################################
# Test whether a string is a dotted-quad IPv4 address.
# Arguments: $1 - string to test
# Returns:   0 if $1 consists of exactly four groups of 1-3 digits separated
#            by dots and every group is <= 255, 1 otherwise
# No global variable is touched (IFS stays as it is), which matters because
# the function lives in the shell that runs the script.
#######################################
function valid_ip()
{
    local ip=$1
    local quad='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
    local n

    [[ $ip =~ $quad ]] || return 1
    for n in 1 2 3 4
    do
        # 10# forces base 10, otherwise an octet like 008 is an invalid octal number
        (( 10#${BASH_REMATCH[n]} <= 255 )) || return 1
    done
    return 0
}

#Print the value of one attribute of the n-th <rev> element of a downloaded file.
#Arguments: $1 - xml file, $2 - index of the <rev> element (starts at 1), $3 - attribute name
#Outputs:   the xpath result without the 'name=' prefix, without quotes and
#           without leading/trailing whitespace; nothing if xpath prints nothing
rev_attr ()
{
	xpath -q -e "//rev[$2]/@$3" "$1" |
		sed -e "s/$3=//g" -e 's/"//g' -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

loop_counter=0
#upper bound of the number of revisions (the last file may hold fewer than
#REV_LIMIT entries); only used for the progress messages on stderr
total_revs=$(( file_no * REV_LIMIT ))
#start from zero on every run: when the script is sourced, the sum of an
#earlier run would otherwise still be present in the shell and be added up
TOTAL_EDITED_BYTES=0
#loop over all files
for (( i=0; i<file_no; i++ ))
do
	#get number of entries in file (counted as 0 if xpath prints nothing)
	NO_REVS=$(xpath -q -e "count(//rev)" "$TMP_FILE$i")
	NO_REVS=${NO_REVS:-0}
	edit_counter=$(( edit_counter + NO_REVS ))
	#loop over all entries in file
	for (( l=1; l<=NO_REVS; l++ ))
	do
		#read in user name and revision size
		exists="n"
		CURRENT_USER=$(rev_attr "$TMP_FILE$i" "$l" user)
		if [[ "$AGG_IP" == "y" ]] && valid_ip "$CURRENT_USER"
		then
			CURRENT_USER="@" #replace IP-Number by @ because its the only symbol not allowed in user names
		fi
		CURRENT_BYTES=$(rev_attr "$TMP_FILE$i" "$l" size)
		#get revision size of one edit before this one to calculate the contributed bytes:
		#the files were requested with rvdir=older, so the preceding edit is the next
		#entry of this file or, for the last entry, the first entry of the next file
		if (( l < NO_REVS ))
		then
			ll=$(( l+1 ))
			CURRENT_BYTES2=$(rev_attr "$TMP_FILE$i" "$ll" size)
		else
			ii=$(( i+1 ))
			if (( ii < file_no ))
			then
				CURRENT_BYTES2=$(rev_attr "$TMP_FILE$ii" 1 size)
			else
				#no older revision: the whole size is attributed to this edit
				CURRENT_BYTES2=0
			fi
		fi
		#calculate contributed bytes and its absolute value; a size that could not
		#be read counts as 0 instead of breaking the arithmetic expression
		CURRENT_CUM_BYTES=$(( ${CURRENT_BYTES:-0} - ${CURRENT_BYTES2:-0} ))
		abs "$CURRENT_CUM_BYTES"
		ABS_CURRENT_CUM_BYTES=$value
		#check whether this user is already in the USER_NAMES array, newest entries first
		for (( k=user_counter-1; k>=0; k-- ))
		do
			if [[ "$CURRENT_USER" == "${USER_NAMES[k]}" ]]
			then
				exists="y"
				current_user_index=$k
				break
			fi
		done
		#either add CURRENT_USER as new user to the arrays or add only the values
		#to the right index of the arrays
		if [[ "$exists" == "n" ]]
		then
			current_user_index=$user_counter
			USER_NAMES[current_user_index]=$CURRENT_USER
			EDIT_COUNT[current_user_index]=1
			CUM_BYTES[current_user_index]=$CURRENT_CUM_BYTES
			ABS_CUM_BYTES[current_user_index]=$ABS_CURRENT_CUM_BYTES
			echo "USER($loop_counter/$total_revs)=${USER_NAMES[current_user_index]}" >&2
			((user_counter++))
		else
			echo "USER($loop_counter/$total_revs)+=${USER_NAMES[current_user_index]}" >&2
			(( EDIT_COUNT[current_user_index]++ ))
			CUM_BYTES[current_user_index]=$(( ${CUM_BYTES[current_user_index]} + CURRENT_CUM_BYTES ))
			ABS_CUM_BYTES[current_user_index]=$(( ${ABS_CUM_BYTES[current_user_index]} + ABS_CURRENT_CUM_BYTES ))
		fi
		TOTAL_EDITED_BYTES=$(( TOTAL_EDITED_BYTES + ABS_CURRENT_CUM_BYTES ))
		((loop_counter++))
	done
done


#Write data as text-table to stdout, sorted by SORT_BY_COLUMN_PRIMARY and
#SORT_BY_COLUMN_SECONDARY (descending). The three header lines start with '#'
#and are printed before the pipeline, so they are not sorted.
echo "Printing results..." >&2
TOTAL_EDITS=$edit_counter
UNIQUE_USERS=$user_counter
printf '#LEMMA=%s\n' "$LEMMA"
printf '#TOTAL_EDITS=%s, #TOTAL_EDITED_BYTES=%s, #UNIQUE_USERS=%s\n' "$TOTAL_EDITS" "$TOTAL_EDITED_BYTES" "$UNIQUE_USERS"
printf '#USER\t#EDITS\t#ABS_BYTES_EDITED\t#BYTES_EDITED\t#SCORE\n'
counter=0
for i in "${USER_NAMES[@]}"
do
	#whitespace in user names would be taken as a column separator by sort/column
	tmp_usr_nm=${i//[[:space:]]/_}
	bytes=${ABS_CUM_BYTES[counter]}
	edit_count=${EDIT_COUNT[counter]}
	#SCORE = share of all edits + (edit_count-1)/edit_count * share of all edited bytes;
	#it is 0 if a total is zero, because bc would fail with a division by zero
	if (( TOTAL_EDITS > 0 && TOTAL_EDITED_BYTES > 0 && edit_count > 0 ))
	then
		SCORE=$(bc <<< "scale=4;$edit_count/$TOTAL_EDITS + ($edit_count-1)/$edit_count*$bytes/$TOTAL_EDITED_BYTES")
	else
		SCORE=0
	fi
	#printf instead of echo -e: backslashes inside a user name stay untouched
	printf '%s\t%s\t%s\t%s\t%s\n' "$tmp_usr_nm" "${EDIT_COUNT[counter]}" "${ABS_CUM_BYTES[counter]}" "${CUM_BYTES[counter]}" "$SCORE"
	((counter++))
done |
#LC_ALL=C: bc writes the score with '.' as decimal point (e.g. .1234); the
#numeric sort has to read it that way regardless of the locale of the user
LC_ALL=C sort -rn -k "$SORT_BY_COLUMN_PRIMARY" -k "$SORT_BY_COLUMN_SECONDARY" | column -t
echo "done." >&2

#delete arrays so that they do not stay behind in the shell that ran the script
unset USER_ARR
unset USER_NAMES
unset CUM_BYTES
unset ABS_CUM_BYTES
unset EDIT_COUNT

#remove tmp files: one file per download, numbered 0 .. file_no-1
#(an unset file_no counts as 0 here, so nothing is removed in that case)
for (( i=0; i<file_no; i++ ))
do
	#quoted and with -- so the name is passed to rm as exactly one operand
	rm -- "$TMP_FILE$i"
done