Wikidata · lindsayerickson · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/README.md b/README.md
@@ -3,14 +3,16 @@
 
 ## Getting Started
 ### Prerequisites
-You need to have `Maven`, `OpenJDK 8` and `Python 2` installed.
+You need to have `Maven`, `OpenJDK 8` and `Python 3` installed.
 
 ### Installing
 ```shell
 $ mvn clean package
 ```
 
 ### Running the main Java log analyser
+(Note: you probably don't want to run the Java analyser directly. Use the Python harness described below instead,
+because it sets the required arguments and creates the necessary directories.)
 ```shell
 # Processes the example SPARQL log files into exampleMonthsFolder/exampleMonth/processedLogData
 $ mvn exec:java@QueryAnalysis -Dexec.args="-w exampleMonthsFolder/exampleMonth -logging"
@@ -20,17 +22,41 @@ $ mvn exec:java@QueryAnalysis -Dexec.args="--help"
 
 ```
 
-**Important:** In order to not flush the command line with error messages all uncatched Runtime Exceptions are being written to the log files, residing in the logs/ folder, so please have a look at those regularly. The logs are not generated by default, so you should enable them using the `-l` option.
+**Important:** To avoid flooding the command line with error messages, all uncaught runtime exceptions are written to the log files in the `logs/` folder, so please check those regularly. The logs are not generated by default, so you should enable them using the `-l` option.
 
-### Running the QueryAnalysis-Script
-The QueryAnalysis-Script handles both steps: Extraction using hive and processing using the java application. Extraction using hive only works on the server, but is ignored if the month exists in the months folder. To run the QueryAnalysis-Script locally, you need to provide the local months folder.
+### Running the QueryAnalysis script
+The QueryAnalysis script handles both steps: extraction using Hive and processing using the Java application.
+Extraction using Hive only works on the server, and it is skipped if the month already exists in the months folder.
+To run the QueryAnalysis script locally, you need to provide the local months folder.
 ```shell
-$ python QueryAnalysis exampleMonth -m ../exampleMonthsFolder
+# Run from the repository root; the -l option enables logging
+$ python3 -m tools.QueryAnalysis exampleMonth -m exampleMonthsFolder -l
 
 # You can also specify multiple months in the same directory by separating them using commas
-$ python QueryAnalysis exampleMonth,otherMonth -m ../exampleMonthsFolder
+$ python3 -m tools.QueryAnalysis exampleMonth,otherMonth -m exampleMonthsFolder -l
 ```
 
+You will also need to update `tools/config.py` so that its paths match your own environment.
+
+### Running the Anonymization script
+After you've extracted the raw query data (done as the first step in the `QueryAnalysis` script above),
+you can anonymize the extracted queries for the specified month(s).
+```shell
+$ python3 -m tools.Anonymize exampleMonth -m exampleMonthsFolder/ -l
+```
+
+## Caveat emptor
+
+This code won't work in its current form on the server.
+The `hive` call in `tools/QueryAnalysis.py` needs to be updated with the current location of the relevant data.
+See the comments in that file.
+
+Depending on what is ultimately extracted from logs, downstream changes in the Java code will probably be necessary.
+For example, `InputHandlerTSV.java` currently expects a URL-encoded query, but as of March 2026 the logs are
+not stored in this form.
+
+Additionally, it will be important to verify that the call to `StandardizingSPARQLParser.anonymize()`
+in `OutputHandlerAnonymizer.java` correctly anonymizes the input queries.
 
 ## License
 The code in this repository is released under the [Apache 2.0](LICENSE.txt) license. External libraries used may have their own licensing terms.
diff --git a/tools/Anonymize.py b/tools/Anonymize.py
@@ -1,10 +1,10 @@
 import argparse
-import config
+from . import config
 import os
 import subprocess
 import sys
 
-from utility import utility
+from .utility import utility
 
 parser = argparse.ArgumentParser("This script creates an anonymous dataset from the rawLogData.")
 parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and "
@@ -38,17 +38,16 @@
     mavenCall.append(mavenArguments)
 
     owd = os.getcwd()
-    os.chdir("..")
 
-    print "Starting data processing using Anonymizer for " + monthName + "."
+    print("Starting data processing using Anonymizer for " + monthName + ".")
 
     if subprocess.call(['mvn', 'clean', 'package']) != 0:
-        print "ERROR: Could not package the java application."
+        print("ERROR: Could not package the java application.")
         sys.exit(1)
 
     if subprocess.call(mavenCall) != 0:
-        print("ERROR: Could not execute the java application. Check the logs "
-              + "for details or rerun this script with -l to generate logs.")
+        print(("ERROR: Could not execute the java application. Check the logs "
+              + "for details or rerun this script with -l to generate logs."))
         sys.exit(1)
 
     os.chdir(owd)
diff --git a/tools/Anonymize.py.bak b/tools/Anonymize.py.bak
@@ -0,0 +1,54 @@
+import argparse
+import config
+import os
+import subprocess
+import sys
+
+from utility import utility
+
+parser = argparse.ArgumentParser("This script creates an anonymous dataset from the rawLogData.")
+parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and "
+                    + "execute anyways", action="store_true")
+parser.add_argument("--threads", "-t", default=10, type=int, help="The number "
+                    + "of threads to run the java program with (default 7).")
+parser.add_argument("--logging", "-l", help="Enables file logging.",
+                    action="store_true")
+parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder,
+                    type=str,
+                    help="The folder in which the months directory are "
+                    + "residing.")
+parser.add_argument("--unanonymizedStringLength", "-u", default=10, type=int,
+                    help="Strings of this length or lower should not be anonymized. Default is ten.")
+parser.add_argument("months", type=str, help="The months to be processed")
+
+if (len(sys.argv[1:]) == 0):
+    parser.print_help()
+    parser.exit()
+
+args = parser.parse_args()
+
+for monthName in args.months.split(","):
+
+    mavenCall = ['mvn', 'exec:java@Anonymizer']
+
+    month = utility.addMissingSlash(os.path.abspath(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(monthName)))
+    mavenArguments = '-Dexec.args=-w ' + month + ' -n ' + str(args.threads) + ' -u ' + str(args.unanonymizedStringLength)
+    if args.logging:
+        mavenArguments += " -l"
+    mavenCall.append(mavenArguments)
+
+    owd = os.getcwd()
+    os.chdir("..")
+
+    print "Starting data processing using Anonymizer for " + monthName + "."
+
+    if subprocess.call(['mvn', 'clean', 'package']) != 0:
+        print "ERROR: Could not package the java application."
+        sys.exit(1)
+
+    if subprocess.call(mavenCall) != 0:
+        print("ERROR: Could not execute the java application. Check the logs "
+              + "for details or rerun this script with -l to generate logs.")
+        sys.exit(1)
+
+    os.chdir(owd)
diff --git a/tools/QueryAnalysis.py b/tools/QueryAnalysis.py
@@ -7,9 +7,8 @@
 import subprocess
 import sys
 import gzip
-import unifyQueryTypes
-from utility import utility
-import config
+from .utility import utility
+from . import config
 
 os.nice(19)
 
@@ -79,8 +78,8 @@
     if os.path.isfile(utility.addMissingSlash(args.monthsFolder)
                       + utility.addMissingSlash(monthName) + "locked") \
        and not args.ignoreLock:
-        print "ERROR: The month " + monthName + " is being edited at the " \
-        + "moment. Use -i if you want to force the execution of this script."
+        print("ERROR: The month " + monthName + " is being edited at the " \
+        + "moment. Use -i if you want to force the execution of this script.")
         sys.exit()
 
     month = utility.addMissingSlash(os.path.abspath(utility.addMissingSlash(args.monthsFolder)
@@ -93,8 +92,8 @@
     # If the month directory does not exist it is being created along with
     # the directories for raw and processed log data.
     if not os.path.exists(month):
-        print("Starting data extraction from wmf.wdqs_extract for "
-              + monthName + ".")
+        print(("Starting data extraction from wmf.wdqs_extract for "
+              + monthName + "."))
 
         os.makedirs(month)
         os.makedirs(processedLogDataDirectory)
@@ -103,7 +102,7 @@
         # For each day we send a command to hive that extracts all entries for
         # this day (in the given month and year) and writes them to temporary
         # files.
-        for day in xrange(1, months[monthName][1] + 1):
+        for day in range(1, months[monthName][1] + 1):
             arguments = ['hive', '-e']
 
             os.makedirs(tempDirectory)
@@ -116,26 +115,35 @@
                 hive_call += field + ", "
             hive_call = hive_call[:-2] + " "
 
+            ########## NEEDS UPDATE! ##############
+            # This hive call is obsolete and doesn't return any results.
+            # The wmf.wdqs_extract data store is no longer the right place to look.
+            # The query needs to be updated based on the current internal data schema.
+            #
+            # The only fields that are used in the anonymization part of the code
+            # are |uri_query| and |ts|; the others are for analysis only.
+            #
+            # |uri_query| is assumed to have the form "?query=<URL-encoded query>"
             hive_call += ' from wmf.wdqs_extract where uri_query<>"" ' \
                     + 'and year=\'' + str(args.year) +  '\' and month=\'' \
                     + str(months[monthName][0]) + '\' and day=\'' + str(day) + '\''
 
             arguments.append(hive_call)
             if subprocess.call(arguments) != 0:
-                print("ERROR: Raw data for month " + monthName + " does not "
-                      + "exist but could not be extracted using hive.")
+                print(("ERROR: Raw data for month " + monthName + " does not "
+                      + "exist but could not be extracted using hive."))
                 sys.exit(1)
 
             # The content of the temporary files is then copied to the actual
             # raw log data file (with added headers)
             with gzip.open(rawLogDataDirectory + "QueryCnt"
                            + "%02d"%day  + ".tsv.gz", "wb") as dayfile:
-                dayfile.write(header)
+                dayfile.write(header.encode('utf-8'))
 
                 for filename in glob.glob(tempDirectory + '*'):
                     with open(filename) as temp:
                         for line in temp:
-                            dayfile.write(line)
+                            dayfile.write(line.encode('utf-8'))
 
             shutil.rmtree(tempDirectory)
 
@@ -162,17 +170,16 @@
     mavenCall.append(mavenArguments)
 
     owd = os.getcwd()
-    os.chdir("..")
 
-    print "Starting data processing using QueryAnalysis for " + monthName + "."
+    print("Starting data processing using QueryAnalysis for " + monthName + ".")
 
     if subprocess.call(['mvn', 'clean', 'package']) != 0:
-        print "ERROR: Could not package the java application."
+        print("ERROR: Could not package the java application.")
         sys.exit(1)
 
     if subprocess.call(mavenCall) != 0:
-        print("ERROR: Could not execute the java application. Check the logs "
-              + "for details or rerun this script with -l to generate logs.")
+        print(("ERROR: Could not execute the java application. Check the logs "
+              + "for details or rerun this script with -l to generate logs."))
         sys.exit(1)
 
     os.chdir(owd)