From 157a20d58e30ca4aceb97e38c5ed94a17f1f8059 Mon Sep 17 00:00:00 2001 From: Lindsay Erickson Date: Fri, 27 Feb 2026 10:11:25 -0500 Subject: [PATCH 1/7] Run 2to3.py on the entire repo. --- tools/Anonymize.py | 12 +- tools/Anonymize.py.bak | 54 ++++ tools/QueryAnalysis.py | 26 +- tools/QueryAnalysis.py.bak | 178 +++++++++++++ tools/automatedBotClassification.py | 28 +- tools/automatedBotClassification.py.bak | 247 ++++++++++++++++++ tools/botClassificationHelper.py | 18 +- tools/botClassificationHelper.py.bak | 151 +++++++++++ tools/countAnything.py | 16 +- tools/countAnything.py.bak | 66 +++++ tools/countRdfProperties.py | 16 +- tools/countRdfProperties.py.bak | 65 +++++ tools/countTools.py | 12 +- tools/countTools.py.bak | 51 ++++ tools/countValid.py | 10 +- tools/countValid.py.bak | 48 ++++ tools/createTestData.py | 18 +- tools/createTestData.py.bak | 82 ++++++ tools/exampleDatasetGenerator.py | 8 +- tools/exampleDatasetGenerator.py.bak | 44 ++++ tools/extract.py | 14 +- tools/extract.py.bak | 186 +++++++++++++ tools/extractCachedData.py | 20 +- tools/extractCachedData.py.bak | 138 ++++++++++ tools/extractQueryTypeDataRanking.py | 36 +-- tools/extractQueryTypeDataRanking.py.bak | 141 ++++++++++ tools/extractQueryTypeDataset.py | 14 +- tools/extractQueryTypeDataset.py.bak | 91 +++++++ tools/featureVectors.py | 12 +- tools/featureVectors.py.bak | 108 ++++++++ tools/fieldEntriesDaysApart.py | 18 +- tools/fieldEntriesDaysApart.py.bak | 139 ++++++++++ tools/fieldRanking.py | 10 +- tools/fieldRanking.py.bak | 114 ++++++++ tools/generalStat.py | 18 +- tools/generalStat.py.bak | 84 ++++++ tools/geoHeatMap.py | 8 +- tools/geoHeatMap.py.bak | 72 +++++ tools/getDriveStatistics.py | 24 +- tools/getDriveStatistics.py.bak | 74 ++++++ tools/getHourlyMetricCount.py | 22 +- tools/getHourlyMetricCount.py.bak | 94 +++++++ tools/getSparqlStatistic.py | 14 +- tools/getSparqlStatistic.py.bak | 114 ++++++++ tools/joinMonth.py | 10 +- tools/joinMonth.py.bak | 68 +++++ 
tools/oldScripts/getProperties.py | 2 +- tools/oldScripts/getProperties.py.bak | 35 +++ tools/operatorUsageStatistic.py | 16 +- tools/operatorUsageStatistic.py.bak | 151 +++++++++++ tools/plotHourlyMetricCount.py | 10 +- tools/plotHourlyMetricCount.py.bak | 161 ++++++++++++ tools/postprocess/processdata.py | 26 +- tools/postprocess/processdata.py.bak | 123 +++++++++ tools/rankDataTypes.py | 14 +- tools/rankDataTypes.py.bak | 54 ++++ tools/showData.py | 20 +- tools/showData.py.bak | 105 ++++++++ tools/sortByTime.py | 10 +- tools/sortByTime.py.bak | 60 +++++ tools/utility/utility.py | 12 +- tools/utility/utility.py.bak | 82 ++++++ tools/visualisePropertyTree/createHtml.py | 2 +- tools/visualisePropertyTree/createHtml.py.bak | 165 ++++++++++++ tools/xyMapping.py | 10 +- tools/xyMapping.py.bak | 134 ++++++++++ 66 files changed, 3732 insertions(+), 253 deletions(-) create mode 100644 tools/Anonymize.py.bak create mode 100644 tools/QueryAnalysis.py.bak create mode 100644 tools/automatedBotClassification.py.bak create mode 100644 tools/botClassificationHelper.py.bak create mode 100644 tools/countAnything.py.bak create mode 100644 tools/countRdfProperties.py.bak create mode 100644 tools/countTools.py.bak create mode 100644 tools/countValid.py.bak create mode 100644 tools/createTestData.py.bak create mode 100644 tools/exampleDatasetGenerator.py.bak create mode 100644 tools/extract.py.bak create mode 100644 tools/extractCachedData.py.bak create mode 100644 tools/extractQueryTypeDataRanking.py.bak create mode 100644 tools/extractQueryTypeDataset.py.bak create mode 100644 tools/featureVectors.py.bak create mode 100644 tools/fieldEntriesDaysApart.py.bak create mode 100644 tools/fieldRanking.py.bak create mode 100644 tools/generalStat.py.bak create mode 100644 tools/geoHeatMap.py.bak create mode 100644 tools/getDriveStatistics.py.bak create mode 100644 tools/getHourlyMetricCount.py.bak create mode 100644 tools/getSparqlStatistic.py.bak create mode 100644 tools/joinMonth.py.bak 
create mode 100644 tools/oldScripts/getProperties.py.bak create mode 100644 tools/operatorUsageStatistic.py.bak create mode 100644 tools/plotHourlyMetricCount.py.bak create mode 100644 tools/postprocess/processdata.py.bak create mode 100644 tools/rankDataTypes.py.bak create mode 100644 tools/showData.py.bak create mode 100644 tools/sortByTime.py.bak create mode 100644 tools/utility/utility.py.bak create mode 100644 tools/visualisePropertyTree/createHtml.py.bak create mode 100644 tools/xyMapping.py.bak diff --git a/tools/Anonymize.py b/tools/Anonymize.py index f1f69f6..731df4a 100644 --- a/tools/Anonymize.py +++ b/tools/Anonymize.py @@ -1,10 +1,10 @@ import argparse -import config +from . import config import os import subprocess import sys -from utility import utility +from .utility import utility parser = argparse.ArgumentParser("This script creates an anonymous dataset from the rawLogData.") parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and " @@ -40,15 +40,15 @@ owd = os.getcwd() os.chdir("..") - print "Starting data processing using Anonymizer for " + monthName + "." + print("Starting data processing using Anonymizer for " + monthName + ".") if subprocess.call(['mvn', 'clean', 'package']) != 0: - print "ERROR: Could not package the java application." + print("ERROR: Could not package the java application.") sys.exit(1) if subprocess.call(mavenCall) != 0: - print("ERROR: Could not execute the java application. Check the logs " - + "for details or rerun this script with -l to generate logs.") + print(("ERROR: Could not execute the java application. 
Check the logs " + + "for details or rerun this script with -l to generate logs.")) sys.exit(1) os.chdir(owd) diff --git a/tools/Anonymize.py.bak b/tools/Anonymize.py.bak new file mode 100644 index 0000000..f1f69f6 --- /dev/null +++ b/tools/Anonymize.py.bak @@ -0,0 +1,54 @@ +import argparse +import config +import os +import subprocess +import sys + +from utility import utility + +parser = argparse.ArgumentParser("This script creates an anonymous dataset from the rawLogData.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and " + + "execute anyways", action="store_true") +parser.add_argument("--threads", "-t", default=10, type=int, help="The number " + + "of threads to run the java program with (default 7).") +parser.add_argument("--logging", "-l", help="Enables file logging.", + action="store_true") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, + help="The folder in which the months directory are " + + "residing.") +parser.add_argument("--unanonymizedStringLength", "-u", default=10, type=int, + help="Strings of this length or lower should not be anonymized. Default is ten.") +parser.add_argument("months", type=str, help="The months to be processed") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +for monthName in args.months.split(","): + + mavenCall = ['mvn', 'exec:java@Anonymizer'] + + month = utility.addMissingSlash(os.path.abspath(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(monthName))) + mavenArguments = '-Dexec.args=-w ' + month + ' -n ' + str(args.threads) + ' -u ' + str(args.unanonymizedStringLength) + if args.logging: + mavenArguments += " -l" + mavenCall.append(mavenArguments) + + owd = os.getcwd() + os.chdir("..") + + print "Starting data processing using Anonymizer for " + monthName + "." + + if subprocess.call(['mvn', 'clean', 'package']) != 0: + print "ERROR: Could not package the java application." 
+ sys.exit(1) + + if subprocess.call(mavenCall) != 0: + print("ERROR: Could not execute the java application. Check the logs " + + "for details or rerun this script with -l to generate logs.") + sys.exit(1) + + os.chdir(owd) diff --git a/tools/QueryAnalysis.py b/tools/QueryAnalysis.py index c2cc315..bca8591 100644 --- a/tools/QueryAnalysis.py +++ b/tools/QueryAnalysis.py @@ -8,8 +8,8 @@ import sys import gzip import unifyQueryTypes -from utility import utility -import config +from .utility import utility +from . import config os.nice(19) @@ -79,8 +79,8 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(monthName) + "locked") \ and not args.ignoreLock: - print "ERROR: The month " + monthName + " is being edited at the " \ - + "moment. Use -i if you want to force the execution of this script." + print("ERROR: The month " + monthName + " is being edited at the " \ + + "moment. Use -i if you want to force the execution of this script.") sys.exit() month = utility.addMissingSlash(os.path.abspath(utility.addMissingSlash(args.monthsFolder) @@ -93,8 +93,8 @@ # If the month directory does not exist it is being created along with # the directories for raw and processed log data. if not os.path.exists(month): - print("Starting data extraction from wmf.wdqs_extract for " - + monthName + ".") + print(("Starting data extraction from wmf.wdqs_extract for " + + monthName + ".")) os.makedirs(month) os.makedirs(processedLogDataDirectory) @@ -103,7 +103,7 @@ # For each day we send a command to hive that extracts all entries for # this day (in the given month and year) and writes them to temporary # files. 
- for day in xrange(1, months[monthName][1] + 1): + for day in range(1, months[monthName][1] + 1): arguments = ['hive', '-e'] os.makedirs(tempDirectory) @@ -122,8 +122,8 @@ arguments.append(hive_call) if subprocess.call(arguments) != 0: - print("ERROR: Raw data for month " + monthName + " does not " - + "exist but could not be extracted using hive.") + print(("ERROR: Raw data for month " + monthName + " does not " + + "exist but could not be extracted using hive.")) sys.exit(1) # The content of the temporary files is then copied to the actual @@ -164,15 +164,15 @@ owd = os.getcwd() os.chdir("..") - print "Starting data processing using QueryAnalysis for " + monthName + "." + print("Starting data processing using QueryAnalysis for " + monthName + ".") if subprocess.call(['mvn', 'clean', 'package']) != 0: - print "ERROR: Could not package the java application." + print("ERROR: Could not package the java application.") sys.exit(1) if subprocess.call(mavenCall) != 0: - print("ERROR: Could not execute the java application. Check the logs " - + "for details or rerun this script with -l to generate logs.") + print(("ERROR: Could not execute the java application. 
Check the logs " + + "for details or rerun this script with -l to generate logs.")) sys.exit(1) os.chdir(owd) \ No newline at end of file diff --git a/tools/QueryAnalysis.py.bak b/tools/QueryAnalysis.py.bak new file mode 100644 index 0000000..c2cc315 --- /dev/null +++ b/tools/QueryAnalysis.py.bak @@ -0,0 +1,178 @@ +import argparse +import calendar +from datetime import datetime +import glob +import os +import shutil +import subprocess +import sys +import gzip +import unifyQueryTypes +from utility import utility +import config + +os.nice(19) + +months = {'january': [1, 31], + 'february': [2, 28], + 'march': [3, 31], + 'april': [4, 30], + 'may': [5, 31], + 'june': [6, 30], + 'july': [7, 31], + 'august': [8, 31], + 'september': [9, 30], + 'october': [10, 31], + 'november': [11, 30], + 'december': [12, 31]} + +parser = argparse.ArgumentParser("This script extracts the raw log data (if " + + "it was not already done), processes them" + + " using the java application and unifies " + + "the query types.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and " + + "execute anyways", action="store_true") +parser.add_argument("--threads", "-t", default=6, type=int, help="The number " + + "of threads to run the java program with (default 7).") +parser.add_argument("--logging", "-l", help="Enables file logging.", + action="store_true") +parser.add_argument("--noBotMetrics", "-b", help="Disables metric calculation" + + " for bot queries.", action="store_true") +parser.add_argument("--noDynamicQueryTypes", "-d", help="Disables dynamic " + + "generation of query types.", action="store_true") +parser.add_argument("--noGzipOutput", "-g", help="Disables gzipping of the " + + "output files.", action="store_true") +parser.add_argument("--noExampleQueriesOutput", "-e", help="Disables the " + + "matching of example queries.", action="store_true") +parser.add_argument("--withUniqueQueryDetection", "-u", help="Enable unique query detection", action="store_true") 
+parser.add_argument("--dbLocation", "-p", type = str, default = config.dbLocation, help = "The path of the uniqueQueriesMapDb file.") +parser.add_argument("--queryTypeMapLocation", "-q", type = str, default = config.queryTypeMapDbLocation, help = "The path of the query type map db file. Default is in the working directory.") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, + help="The folder in which the months directory are " + + "residing.") +parser.add_argument("--year", "-y", default=datetime.now().year, type=int, + help="The year to be processed (default current year).") +parser.add_argument("months", type=str, help="The months to be processed") + +# These are the field we extract from wmf.wdqs_extract that form the raw +# log data. They are not configurable via argument because the java program +# does not detect headers and thus depends on this specific order. +fields = ["uri_query", "uri_path", "user_agent", "ts", "agent_type", + "hour", "http_status"] + +header = "" +for field in fields: + header += field + "\t" +header = header[:-1] + "\n" + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if calendar.isleap(args.year): + months['february'][1] = 29 + +for monthName in args.months.split(","): + if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(monthName) + "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + monthName + " is being edited at the " \ + + "moment. Use -i if you want to force the execution of this script." 
+ sys.exit() + + month = utility.addMissingSlash(os.path.abspath(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(monthName))) + + processedLogDataDirectory = month + "processedLogData/" + rawLogDataDirectory = month + "rawLogData/" + tempDirectory = rawLogDataDirectory + "temp/" + + # If the month directory does not exist it is being created along with + # the directories for raw and processed log data. + if not os.path.exists(month): + print("Starting data extraction from wmf.wdqs_extract for " + + monthName + ".") + + os.makedirs(month) + os.makedirs(processedLogDataDirectory) + os.makedirs(rawLogDataDirectory) + + # For each day we send a command to hive that extracts all entries for + # this day (in the given month and year) and writes them to temporary + # files. + for day in xrange(1, months[monthName][1] + 1): + arguments = ['hive', '-e'] + + os.makedirs(tempDirectory) + hive_call = 'insert overwrite local directory \'' + tempDirectory \ + + '\' row format delimited fields terminated ' \ + + 'by \'\\t\' select ' + + # We add all the fields to the request + for field in fields: + hive_call += field + ", " + hive_call = hive_call[:-2] + " " + + hive_call += ' from wmf.wdqs_extract where uri_query<>"" ' \ + + 'and year=\'' + str(args.year) + '\' and month=\'' \ + + str(months[monthName][0]) + '\' and day=\'' + str(day) + '\'' + + arguments.append(hive_call) + if subprocess.call(arguments) != 0: + print("ERROR: Raw data for month " + monthName + " does not " + + "exist but could not be extracted using hive.") + sys.exit(1) + + # The content of the temporary files is then copied to the actual + # raw log data file (with added headers) + with gzip.open(rawLogDataDirectory + "QueryCnt" + + "%02d"%day + ".tsv.gz", "wb") as dayfile: + dayfile.write(header) + + for filename in glob.glob(tempDirectory + '*'): + with open(filename) as temp: + for line in temp: + dayfile.write(line) + + shutil.rmtree(tempDirectory) + + # We build the call to 
execute the java application with the location of + # the files, the number of threads to use and any optional arguments needed + + mavenCall = ['mvn', 'exec:java@QueryAnalysis'] + + mavenArguments = '-Dexec.args=-w ' + month + ' -t ' + str(args.threads) + ' -p ' + args.dbLocation + " -q " + args.queryTypeMapLocation + + if args.logging: + mavenArguments += " -l" + if args.noBotMetrics: + mavenArguments += " -b" + if args.noDynamicQueryTypes: + mavenArguments += " -d" + if args.noGzipOutput: + mavenArguments += " -g" + if args.noExampleQueriesOutput: + mavenArguments += " -e" + if args.withUniqueQueryDetection: + mavenArguments += " -u" + + mavenCall.append(mavenArguments) + + owd = os.getcwd() + os.chdir("..") + + print "Starting data processing using QueryAnalysis for " + monthName + "." + + if subprocess.call(['mvn', 'clean', 'package']) != 0: + print "ERROR: Could not package the java application." + sys.exit(1) + + if subprocess.call(mavenCall) != 0: + print("ERROR: Could not execute the java application. Check the logs " + + "for details or rerun this script with -l to generate logs.") + sys.exit(1) + + os.chdir(owd) \ No newline at end of file diff --git a/tools/automatedBotClassification.py b/tools/automatedBotClassification.py index 4d1c43e..9f6a633 100644 --- a/tools/automatedBotClassification.py +++ b/tools/automatedBotClassification.py @@ -1,4 +1,4 @@ -from __future__ import print_function + import argparse import csv @@ -7,13 +7,13 @@ import shutil import subprocess import sys -import urllib +import urllib.request, urllib.parse, urllib.error -import config -import fieldRanking +from . import config +from . 
import fieldRanking -from postprocess import processdata -from utility import utility +from .postprocess import processdata +from .utility import utility parser = argparse.ArgumentParser(description = "This script searches for all combinations with occurences above the threshold.") parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, @@ -123,7 +123,7 @@ class botClassification(): def prepare(self): result = fieldRanking.fieldRanking(args.month, queryType, args.monthsFolder, ignoreLock = args.ignoreLock, filterParams = args.filter) - for index, (keyOneEntry, keyOneEntryCount) in enumerate(sorted(result.iteritems(), key=lambda (k, v): (v, k), reverse = True)): + for index, (keyOneEntry, keyOneEntryCount) in enumerate(sorted(iter(result.items()), key=lambda k_v1: (k_v1[1], k_v1[0]), reverse = True)): if keyOneEntryCount < args.threshold: break self.queryTypes[keyOneEntry] = dict() @@ -147,9 +147,9 @@ def handle(self, sparqlQuery, processed): queryTypeDict[userAgentEntry].append(sparqlQuery) def threshold(self): - for queryTypeEntry, queryTypeDict in self.queryTypes.items(): + for queryTypeEntry, queryTypeDict in list(self.queryTypes.items()): self.queryTypesCount[queryTypeEntry] = 0 - for userAgentEntry, queries in queryTypeDict.items(): + for userAgentEntry, queries in list(queryTypeDict.items()): numberOfQueries = len(queries) if (numberOfQueries < args.threshold): del queryTypeDict[userAgentEntry] @@ -167,17 +167,17 @@ def writeOut(self): with open(manualCheckupFolder + "readme.md", "w") as readmeFile: print("This directory contains all " + queryType + "-" + userAgent + "-Combinations above a threshold of " + str(args.threshold) + ".", file = readmeFile) print("count\t" + queryType + "\t" + userAgent + "-count", file = readmeFile) - for queryTypeEntry, count in sorted(self.queryTypesCount.iteritems(), key = lambda (k, v): (v, k), reverse = True): + for queryTypeEntry, count in sorted(iter(self.queryTypesCount.items()), key = lambda k_v2: 
(k_v2[1], k_v2[0]), reverse = True): print(str(count) + "\t" + queryTypeEntry + "\t" + str(len(self.queryTypes[queryTypeEntry])), file = readmeFile) - for queryTypeEntry, queryTypeDict in self.queryTypes.iteritems(): + for queryTypeEntry, queryTypeDict in self.queryTypes.items(): queryTypePath = preparePath(manualCheckupFolder, queryTypeEntry, tooLong) with open(queryTypePath + "info.txt", "w") as infoQueryTypeFile: print("count\t" + userAgent, file = infoQueryTypeFile) - for i, (userAgentEntry, queries) in enumerate(sorted(queryTypeDict.iteritems(), key = lambda (k, v): (len(v), k), reverse = True)): + for i, (userAgentEntry, queries) in enumerate(sorted(iter(queryTypeDict.items()), key = lambda k_v: (len(k_v[1]), k_v[0]), reverse = True)): print(str(len(queries)) + "\t" + userAgentEntry, file = infoQueryTypeFile) @@ -190,7 +190,7 @@ def writeOut(self): with open(pathBase + "newBots.tsv", "w") as newBots, gzip.open(tempForAnonymization + "QueryCnt01.tsv.gz", "w") as forAnonymization: print("queryType\tuserAgent\ttool\tversion\tcomment", file = newBots) print("uri_query\turi_path\tuser_agent\tts\tagent_type\thour\thttp_status", file = forAnonymization) - for queryTypeEntry, queryTypeDict in self.queryTypes.iteritems(): + for queryTypeEntry, queryTypeDict in self.queryTypes.items(): firstUserAgent = None for userAgentEntry in queryTypeDict: if firstUserAgent == None: @@ -198,7 +198,7 @@ def writeOut(self): print(queryTypeEntry + "\t" + userAgentEntry + "\t" + "not set\tnot set\t", file = newBots) if firstUserAgent != None: example = queryTypeDict[firstUserAgent][0] - encoded = urllib.quote_plus(example) + encoded = urllib.parse.quote_plus(example) print("?query=" + encoded + "\tpath\tagent\ttime\ttype\thour\tstatus", file = forAnonymization) queryTypeOrder.append(queryTypeEntry) diff --git a/tools/automatedBotClassification.py.bak b/tools/automatedBotClassification.py.bak new file mode 100644 index 0000000..4d1c43e --- /dev/null +++ 
b/tools/automatedBotClassification.py.bak @@ -0,0 +1,247 @@ +from __future__ import print_function + +import argparse +import csv +import gzip +import os +import shutil +import subprocess +import sys +import urllib + +import config +import fieldRanking + +from postprocess import processdata +from utility import utility + +parser = argparse.ArgumentParser(description = "This script searches for all combinations with occurences above the threshold.") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory " + + "are residing.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") +parser.add_argument("--outputPath", "-o", type=str, help="The path where the " + + "output files should be generated.") +parser.add_argument("--logging", "-l", help="Enables file logging.", + action="store_true") +parser.add_argument("--filter", "-f", default="", type=str, help="Constraints " + + "used to limit the lines used to generate the output." + + " Default filter is Valid=^VALID$." + + " Enter as =,/ (e.g." + + " QueryType=wikidataLastModified,ToolName=^USER$)" + + " NOTE: If you use this option you should probably also" + + " set the --outputPath to some value other than the " + + "default.") +parser.add_argument("month", type=str, help="The month for which the ranking should be generated.") +parser.add_argument("--threshold", "-t", default = 2000, type = int, help = "The threshold above which the combinations should be listed. Default is 2000.") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +monthsFolder = utility.addMissingSlash(args.monthsFolder) +month = utility.addMissingSlash(args.month) + +if os.path.isfile(monthsFolder + month + "locked") \ + and not ignoreLock: + print ("ERROR: The month " + args.month + " is being edited at the moment." 
+ + " Use -i or ignoreLock = True if you want to force the execution of this script.") + sys.exit() + +subfolder = "automatedBotClassification/" + +pathBase = monthsFolder + month + subfolder + +if not os.path.exists(pathBase): + os.makedirs(pathBase) + +preBuildFolder = pathBase + "preBuildQueryTypeFiles/" + +if not os.path.exists(preBuildFolder): + os.makedirs(preBuildFolder) + +tempForAnonymization = pathBase + "rawLogData/" + +if not os.path.exists(tempForAnonymization): + os.makedirs(tempForAnonymization) + +manualCheckupFolder = pathBase + "manualCheckupFolder/" + +if not os.path.exists(manualCheckupFolder): + os.makedirs(manualCheckupFolder) + +if args.outputPath is not None: + pathBase = args.outputPath + +filter = utility.filter() + +filter.setup(args.filter) + +queryType = "QueryType" +userAgent = "user_agent" + +queryTypeOrder = list() + +toolNamesToIgnore = list() + +with open ("../userAgentClassification/toolNameForUserCategory.tsv") as toolNames: + toolReader = csv.DictReader(toolNames, delimiter="\t") + for entry in toolReader: + toolNamesToIgnore.append(entry["tool names to be included in the user source category"]) + +def preparePath(path, directory, i): + replacedDirectory = directory.replace("/", "SLASH") + + if len(replacedDirectory) > 140: + replacedDirectory = replacedDirectory[:140] + str(i) + i += 1 + + fullPath = path + replacedDirectory + "/" + + if not os.path.exists(fullPath): + os.makedirs(fullPath) + + return fullPath + +class anonymizationReader(): + + counter = 0 + + def handle(self, sparqlQuery, processed): + with open(preBuildFolder + queryTypeOrder[self.counter] + ".preBuildQueryType", "w") as queryTypeExample: + print(sparqlQuery, file = queryTypeExample) + self.counter += 1 + + +class botClassification(): + queryTypes = dict() + + queryTypesCount = dict() + + def prepare(self): + result = fieldRanking.fieldRanking(args.month, queryType, args.monthsFolder, ignoreLock = args.ignoreLock, filterParams = args.filter) + for index, 
(keyOneEntry, keyOneEntryCount) in enumerate(sorted(result.iteritems(), key=lambda (k, v): (v, k), reverse = True)): + if keyOneEntryCount < args.threshold: + break + self.queryTypes[keyOneEntry] = dict() + self.queryTypesCount[keyOneEntry] = keyOneEntryCount + + def handle(self, sparqlQuery, processed): + if not filter.checkLine(processed): + return + + if processed["#ToolName"] in toolNamesToIgnore: + return + + queryTypeEntry = processed["#" + queryType] + if queryTypeEntry not in self.queryTypes: + return + + queryTypeDict = self.queryTypes[queryTypeEntry] + userAgentEntry = processed["#" + userAgent] + if (userAgentEntry not in queryTypeDict): + queryTypeDict[userAgentEntry] = list() + queryTypeDict[userAgentEntry].append(sparqlQuery) + + def threshold(self): + for queryTypeEntry, queryTypeDict in self.queryTypes.items(): + self.queryTypesCount[queryTypeEntry] = 0 + for userAgentEntry, queries in queryTypeDict.items(): + numberOfQueries = len(queries) + if (numberOfQueries < args.threshold): + del queryTypeDict[userAgentEntry] + else: + self.queryTypesCount[queryTypeEntry] += numberOfQueries + + if len(queryTypeDict) == 0: + del self.queryTypes[queryTypeEntry] + del self.queryTypesCount[queryTypeEntry] + + + def writeOut(self): + tooLong = 0 + + with open(manualCheckupFolder + "readme.md", "w") as readmeFile: + print("This directory contains all " + queryType + "-" + userAgent + "-Combinations above a threshold of " + str(args.threshold) + ".", file = readmeFile) + print("count\t" + queryType + "\t" + userAgent + "-count", file = readmeFile) + for queryTypeEntry, count in sorted(self.queryTypesCount.iteritems(), key = lambda (k, v): (v, k), reverse = True): + print(str(count) + "\t" + queryTypeEntry + "\t" + str(len(self.queryTypes[queryTypeEntry])), file = readmeFile) + + for queryTypeEntry, queryTypeDict in self.queryTypes.iteritems(): + + queryTypePath = preparePath(manualCheckupFolder, queryTypeEntry, tooLong) + + with open(queryTypePath + "info.txt", "w") 
as infoQueryTypeFile: + print("count\t" + userAgent, file = infoQueryTypeFile) + + for i, (userAgentEntry, queries) in enumerate(sorted(queryTypeDict.iteritems(), key = lambda (k, v): (len(v), k), reverse = True)): + + print(str(len(queries)) + "\t" + userAgentEntry, file = infoQueryTypeFile) + + userAgentPath = preparePath(queryTypePath, userAgentEntry, tooLong) + + for i, query in enumerate(queries): + with open(userAgentPath + "{}.query".format(i), "w") as queryFile: + queryFile.write(str(query)) + + with open(pathBase + "newBots.tsv", "w") as newBots, gzip.open(tempForAnonymization + "QueryCnt01.tsv.gz", "w") as forAnonymization: + print("queryType\tuserAgent\ttool\tversion\tcomment", file = newBots) + print("uri_query\turi_path\tuser_agent\tts\tagent_type\thour\thttp_status", file = forAnonymization) + for queryTypeEntry, queryTypeDict in self.queryTypes.iteritems(): + firstUserAgent = None + for userAgentEntry in queryTypeDict: + if firstUserAgent == None: + firstUserAgent = userAgentEntry + print(queryTypeEntry + "\t" + userAgentEntry + "\t" + "not set\tnot set\t", file = newBots) + if firstUserAgent != None: + example = queryTypeDict[firstUserAgent][0] + encoded = urllib.quote_plus(example) + print("?query=" + encoded + "\tpath\tagent\ttime\ttype\thour\tstatus", file = forAnonymization) + queryTypeOrder.append(queryTypeEntry) + + #print("---------------------------\n") + #print(queryTypeEntry + "\n") + #print(example + "\n") + + mavenCall = ['mvn', 'exec:java@Anonymizer'] + + mavenArguments = '-Dexec.args=-w ' + pathBase + if args.logging: + mavenArguments += " -l" + mavenCall.append(mavenArguments) + + owd = os.getcwd() + os.chdir("..") + + print("Starting anonymization of pre build query types for " + args.month + ".") + + if subprocess.call(['mvn', 'clean', 'package']) != 0: + print("ERROR: Could not package the java application.") + sys.exit(1) + + if subprocess.call(mavenCall) != 0: + print("ERROR: Could not execute the java application. 
Check the logs " + + "for details or rerun this script with -l to generate logs.") + sys.exit(1) + + os.chdir(owd) + + readerHandler = anonymizationReader() + + processdata.processDayAnonymous(readerHandler, 1, subfolder, monthsFolder + month) + +handler = botClassification() + +handler.prepare() + +processdata.processMonth(handler, args.month, args.monthsFolder) + +handler.threshold() + +handler.writeOut() + +shutil.rmtree(tempForAnonymization) +shutil.rmtree(pathBase + "anonymousRawData/") diff --git a/tools/botClassificationHelper.py b/tools/botClassificationHelper.py index 8484af5..2551c61 100644 --- a/tools/botClassificationHelper.py +++ b/tools/botClassificationHelper.py @@ -1,14 +1,14 @@ -from __future__ import print_function + import argparse import os import sys -import config -import fieldRanking +from . import config +from . import fieldRanking -from postprocess import processdata -from utility import utility +from .postprocess import processdata +from .utility import utility parser = argparse.ArgumentParser(description = "This script searches for all queries for the top N query types and their top N user agents.") parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, @@ -88,7 +88,7 @@ class botClassification(): def prepare(self): result = fieldRanking.fieldRanking(args.month, key1, args.monthsFolder, ignoreLock = args.ignoreLock, filterParams = args.filter) - for i, (k, v) in enumerate(sorted(result.iteritems(), key=lambda (k, v): (v, k), reverse = True)): + for i, (k, v) in enumerate(sorted(iter(result.items()), key=lambda k_v1: (k_v1[1], k_v1[0]), reverse = True)): self.actualNumber = i if i >= args.numberOfCombinations: break @@ -115,17 +115,17 @@ def writeOut(self): with open(pathBase + "readme.md", "w") as readmeFile: print("This directory contains all top {}".format(self.actualNumber) + " " + key1 + "-" + key2 + "-Combinations.", file = readmeFile) print("count\t" + key1, file = readmeFile) - for firstKey, count in 
sorted(self.firstKeysCount.iteritems(), key = lambda (k, v): (v, k), reverse = True): + for firstKey, count in sorted(iter(self.firstKeysCount.items()), key = lambda k_v2: (k_v2[1], k_v2[0]), reverse = True): print(str(count) + "\t" + firstKey, file = readmeFile) - for firstKey, secondKeyDict in self.firstKeys.iteritems(): + for firstKey, secondKeyDict in self.firstKeys.items(): firstKeyPath = preparePath(pathBase, firstKey, tooLong) with open(firstKeyPath + "info.txt", "w") as infoFirstKeyFile: print("count\t" + key2, file = infoFirstKeyFile) - for i, (secondKey, queries) in enumerate(sorted(secondKeyDict.iteritems(), key = lambda (k, v): (len(v), k), reverse = True)): + for i, (secondKey, queries) in enumerate(sorted(iter(secondKeyDict.items()), key = lambda k_v: (len(k_v[1]), k_v[0]), reverse = True)): if i >= args.numberOfCombinations: break diff --git a/tools/botClassificationHelper.py.bak b/tools/botClassificationHelper.py.bak new file mode 100644 index 0000000..8484af5 --- /dev/null +++ b/tools/botClassificationHelper.py.bak @@ -0,0 +1,151 @@ +from __future__ import print_function + +import argparse +import os +import sys + +import config +import fieldRanking + +from postprocess import processdata +from utility import utility + +parser = argparse.ArgumentParser(description = "This script searches for all queries for the top N query types and their top N user agents.") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory " + + "are residing.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") +parser.add_argument("--outputPath", "-o", type=str, help="The path where the " + + "output files should be generated.") +parser.add_argument("--filter", "-f", default="", type=str, help="Constraints " + + "used to limit the lines used to generate the output." + + " Default filter is Valid=^VALID$." + + " Enter as =,/ (e.g." 
+ + " QueryType=wikidataLastModified,ToolName=^USER$)" + + " NOTE: If you use this option you should probably also" + + " set the --outputPath to some value other than the " + + "default.") +parser.add_argument("--numberOfCombinations", "-n", type=int, help="The number N for which combinations should be generated." + + " Default is 40.", default = 40) +parser.add_argument("--switchKeys", "-s", help="Switch to searching for top N user agents and their top N query types.", action="store_true") +parser.add_argument("month", type=str, + help="The month for which the ranking should be " + +"generated.") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +monthsFolder = utility.addMissingSlash(args.monthsFolder) +month = utility.addMissingSlash(args.month) + +if os.path.isfile(monthsFolder + month + "locked") \ + and not ignoreLock: + print ("ERROR: The month " + args.month + " is being edited at the moment." + + " Use -i or ignoreLock = True if you want to force the execution of this script.") + sys.exit() + +pathBase = monthsFolder + month + "botClassificationHelper/" + +if args.outputPath is not None: + pathBase = args.outputPath + +filter = utility.filter() + +filter.setup(args.filter) + +key1 = "QueryType" + +key2 = "user_agent" + +if args.switchKeys: + key1, key2 = key2, key1 + +def preparePath(path, directory, i): + replacedDirectory = directory.replace("/", "SLASH") + + if len(replacedDirectory) > 140: + replacedDirectory = replacedDirectory[:140] + str(i) + i += 1 + + fullPath = path + replacedDirectory + "/" + + if not os.path.exists(fullPath): + os.makedirs(fullPath) + + return fullPath + +class botClassification(): + + firstKeys = dict() + + firstKeysCount = dict() + + actualNumber = 0 + + def prepare(self): + result = fieldRanking.fieldRanking(args.month, key1, args.monthsFolder, ignoreLock = args.ignoreLock, filterParams = args.filter) + for i, (k, v) in enumerate(sorted(result.iteritems(), key=lambda (k, 
v): (v, k), reverse = True)): + self.actualNumber = i + if i >= args.numberOfCombinations: + break + self.firstKeys[k] = dict() + self.firstKeysCount[k] = v + + def handle(self, sparqlQuery, processed): + if not filter.checkLine(processed): + return + + firstKey = processed["#" + key1] + if firstKey not in self.firstKeys: + return + + firstKeyDict = self.firstKeys[firstKey] + secondKey = processed["#" + key2] + if secondKey not in firstKeyDict: + firstKeyDict[secondKey] = list() + firstKeyDict[secondKey].append(sparqlQuery) + + def writeOut(self): + tooLong = 0 + + with open(pathBase + "readme.md", "w") as readmeFile: + print("This directory contains all top {}".format(self.actualNumber) + " " + key1 + "-" + key2 + "-Combinations.", file = readmeFile) + print("count\t" + key1, file = readmeFile) + for firstKey, count in sorted(self.firstKeysCount.iteritems(), key = lambda (k, v): (v, k), reverse = True): + print(str(count) + "\t" + firstKey, file = readmeFile) + + for firstKey, secondKeyDict in self.firstKeys.iteritems(): + + firstKeyPath = preparePath(pathBase, firstKey, tooLong) + + with open(firstKeyPath + "info.txt", "w") as infoFirstKeyFile: + print("count\t" + key2, file = infoFirstKeyFile) + + for i, (secondKey, queries) in enumerate(sorted(secondKeyDict.iteritems(), key = lambda (k, v): (len(v), k), reverse = True)): + if i >= args.numberOfCombinations: + break + + print(str(len(queries)) + "\t" + secondKey, file = infoFirstKeyFile) + + secondKeyPath = preparePath(firstKeyPath, secondKey, tooLong) + + for i, query in enumerate(queries): + with open(secondKeyPath + "{}.query".format(i), "w") as queryFile: + queryFile.write(str(query)) + +handler = botClassification() + +handler.prepare() + +processdata.processMonth(handler, args.month, args.monthsFolder) + +if not os.path.exists(pathBase): + os.makedirs(pathBase) + +handler.writeOut() + + diff --git a/tools/countAnything.py b/tools/countAnything.py index a82a8ae..8866bff 100644 --- a/tools/countAnything.py 
+++ b/tools/countAnything.py @@ -2,11 +2,11 @@ # -*- coding: utf-8 -*- import argparse -from postprocess import processdata -import config +from .postprocess import processdata +from . import config import os import sys -from utility import utility +from .utility import utility import operator os.nice(19) @@ -30,7 +30,7 @@ args = parser.parse_args() if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script." + print("ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script.") sys.exit() @@ -55,10 +55,10 @@ def handle(self,sparqlQuery,processed): self.propQueryCounts[prop] = c def printResults(self): - print "Queries: %d" % (self.queryCount) - print "\n\n%s\tcount" % (args.parameter) - for p, c in sorted(self.propQueryCounts.iteritems(), key=operator.itemgetter(1), reverse=True): - print "%s\t%d" % (p,c) + print("Queries: %d" % (self.queryCount)) + print("\n\n%s\tcount" % (args.parameter)) + for p, c in sorted(iter(self.propQueryCounts.items()), key=operator.itemgetter(1), reverse=True): + print("%s\t%d" % (p,c)) handler = CountRdfPropertiesHandler() processdata.processMonth(handler, args.month, args.monthsFolder) diff --git a/tools/countAnything.py.bak b/tools/countAnything.py.bak new file mode 100644 index 0000000..a82a8ae --- /dev/null +++ b/tools/countAnything.py.bak @@ -0,0 +1,66 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import argparse +from postprocess import processdata +import config +import os +import sys +from utility import utility +import operator + +os.nice(19) + +parser = argparse.ArgumentParser( + description="Tool to sum up the uses of RDF properties in queries") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in 
which the months directory are residing.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and " + + "execute anyways", action="store_true") +parser.add_argument('--onlyValid', "-o", action='store_true', help="If set " + + "only valid lines are being looked at") +parser.add_argument("month", type=str, help="The month from which lines " + + "should be displayed.") +parser.add_argument("parameter", type=str, help="The parameter by which to group.") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script." + sys.exit() + + +class CountRdfPropertiesHandler: + queryCount = 0 + propQueryCounts = {} + + def handle(self,sparqlQuery,processed): + self.queryCount += 1 + + if args.onlyValid: + if processed['#Valid'] is not 'VALID': + return + + props = processed[args.parameter].split(",") + + for prop in props: + if prop in self.propQueryCounts: + c = self.propQueryCounts[prop] + 1 + else: + c = 1 + self.propQueryCounts[prop] = c + + def printResults(self): + print "Queries: %d" % (self.queryCount) + print "\n\n%s\tcount" % (args.parameter) + for p, c in sorted(self.propQueryCounts.iteritems(), key=operator.itemgetter(1), reverse=True): + print "%s\t%d" % (p,c) + +handler = CountRdfPropertiesHandler() +processdata.processMonth(handler, args.month, args.monthsFolder) + +handler.printResults() diff --git a/tools/countRdfProperties.py b/tools/countRdfProperties.py index 10721d5..eb70c5f 100644 --- a/tools/countRdfProperties.py +++ b/tools/countRdfProperties.py @@ -2,11 +2,11 @@ # -*- coding: utf-8 -*- import argparse -from postprocess import processdata -import config +from .postprocess import processdata +from . 
import config import os import sys -from utility import utility +from .utility import utility import operator os.nice(19) @@ -29,7 +29,7 @@ args = parser.parse_args() if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script." + print("ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script.") sys.exit() @@ -54,10 +54,10 @@ def handle(self,sparqlQuery,processed): self.propQueryCounts[prop] = c def printResults(self): - print "Queries: %d" % (self.queryCount) - print "\n\nproperty\tcount" - for p, c in sorted(self.propQueryCounts.iteritems(), key=operator.itemgetter(1), reverse=True): - print "%s\t%d" % (p,c) + print("Queries: %d" % (self.queryCount)) + print("\n\nproperty\tcount") + for p, c in sorted(iter(self.propQueryCounts.items()), key=operator.itemgetter(1), reverse=True): + print("%s\t%d" % (p,c)) handler = CountRdfPropertiesHandler() processdata.processMonth(handler, args.month, args.monthsFolder) diff --git a/tools/countRdfProperties.py.bak b/tools/countRdfProperties.py.bak new file mode 100644 index 0000000..10721d5 --- /dev/null +++ b/tools/countRdfProperties.py.bak @@ -0,0 +1,65 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import argparse +from postprocess import processdata +import config +import os +import sys +from utility import utility +import operator + +os.nice(19) + +parser = argparse.ArgumentParser( + description="Tool to sum up the uses of RDF properties in queries") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory are residing.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and " + + "execute anyways", action="store_true") +parser.add_argument('--onlyValid', 
"-o", action='store_true', help="If set " + + "only valid lines are being looked at") +parser.add_argument("month", type=str, help="The month from which lines " + + "should be displayed.") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script." + sys.exit() + + +class CountRdfPropertiesHandler: + queryCount = 0 + propQueryCounts = {} + + def handle(self,sparqlQuery,processed): + self.queryCount += 1 + + if args.onlyValid: + if processed['#Valid'] is not 'VALID': + return + + props = processed["#Predicates"].split(",") + + for prop in props: + if prop in self.propQueryCounts: + c = self.propQueryCounts[prop] + 1 + else: + c = 1 + self.propQueryCounts[prop] = c + + def printResults(self): + print "Queries: %d" % (self.queryCount) + print "\n\nproperty\tcount" + for p, c in sorted(self.propQueryCounts.iteritems(), key=operator.itemgetter(1), reverse=True): + print "%s\t%d" % (p,c) + +handler = CountRdfPropertiesHandler() +processdata.processMonth(handler, args.month, args.monthsFolder) + +handler.printResults() diff --git a/tools/countTools.py b/tools/countTools.py index 61a190c..0915f34 100644 --- a/tools/countTools.py +++ b/tools/countTools.py @@ -3,9 +3,9 @@ import pprint import sys from collections import defaultdict -from postprocess import processdata -from utility import utility -import config +from .postprocess import processdata +from .utility import utility +from . 
import config parser = argparse.ArgumentParser( description="Counts the used tools/bots in the given folder") @@ -27,7 +27,7 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the moment." + print("ERROR: The month " + args.month + " is being edited at the moment.") + " Use -i if you want to force the execution of this script." sys.exit() @@ -40,7 +40,7 @@ def handle(self, sparqlQuery, processed): self.toolCounter[processed['#ToolName']] += 1 def __str__(self): - return pprint.pformat(sorted(self.toolCounter.iteritems(), + return pprint.pformat(sorted(iter(self.toolCounter.items()), key=lambda x: x[1], reverse=True)) @@ -48,4 +48,4 @@ def __str__(self): processdata.processMonth(handler, args.month, args.monthsFolder) -print handler +print(handler) diff --git a/tools/countTools.py.bak b/tools/countTools.py.bak new file mode 100644 index 0000000..61a190c --- /dev/null +++ b/tools/countTools.py.bak @@ -0,0 +1,51 @@ +import argparse +import os +import pprint +import sys +from collections import defaultdict +from postprocess import processdata +from utility import utility +import config + +parser = argparse.ArgumentParser( + description="Counts the used tools/bots in the given folder") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="the folder in which the months" + + " directory are residing") +parser.add_argument("--ignoreLock", "-i", + help="Ignore locked file and execute anyways", + action="store_true") +parser.add_argument("month", type=str, + help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + args.month + " 
is being edited at the moment." + + " Use -i if you want to force the execution of this script." + sys.exit() + + +class CountToolsHandler: + toolCounter = defaultdict(int) + + def handle(self, sparqlQuery, processed): + if (processed['#Valid'] == 'VALID' or processed['#Valid'] == '1'): + self.toolCounter[processed['#ToolName']] += 1 + + def __str__(self): + return pprint.pformat(sorted(self.toolCounter.iteritems(), + key=lambda x: x[1], reverse=True)) + + +handler = CountToolsHandler() + +processdata.processMonth(handler, args.month, args.monthsFolder) + +print handler diff --git a/tools/countValid.py b/tools/countValid.py index bb055da..6a22845 100644 --- a/tools/countValid.py +++ b/tools/countValid.py @@ -2,10 +2,10 @@ import os import sys from collections import defaultdict -from postprocess import processdata -from utility import utility +from .postprocess import processdata +from .utility import utility import pprint -import config +from . import config parser = argparse.ArgumentParser(description="Counts the valid queries") parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, @@ -26,8 +26,8 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print("ERROR: The month " + args.month + " is being edited at the moment." - + " Use -i if you want to force the execution of this script.") + print(("ERROR: The month " + args.month + " is being edited at the moment." 
+ + " Use -i if you want to force the execution of this script.")) sys.exit() diff --git a/tools/countValid.py.bak b/tools/countValid.py.bak new file mode 100644 index 0000000..bb055da --- /dev/null +++ b/tools/countValid.py.bak @@ -0,0 +1,48 @@ +import argparse +import os +import sys +from collections import defaultdict +from postprocess import processdata +from utility import utility +import pprint +import config + +parser = argparse.ArgumentParser(description="Counts the valid queries") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="the folder in which the months directory" + + " are residing") +parser.add_argument("--ignoreLock", "-i", + help="Ignore locked file and execute anyways", + action="store_true") +parser.add_argument("month", type=str, + help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print("ERROR: The month " + args.month + " is being edited at the moment." + + " Use -i if you want to force the execution of this script.") + sys.exit() + + +class CountValidityHandler: + validCounter = defaultdict(int) + + def handle(self, sparqlQuery, processed): + self.validCounter[processed['#Valid']] += 1 + + def __str__(self): + return pprint.pformat(self.validCounter) + + +handler = CountValidityHandler() + +processdata.processMonth(handler, args.month, args.monthsFolder) + +print(handler) diff --git a/tools/createTestData.py b/tools/createTestData.py index 9b6d350..5a35903 100644 --- a/tools/createTestData.py +++ b/tools/createTestData.py @@ -1,12 +1,12 @@ import argparse import os import sys -from postprocess import processdata -from utility import utility -import config +from .postprocess import processdata +from .utility import utility +from . 
import config import glob import gzip -from itertools import izip + from random import random parser = argparse.ArgumentParser( @@ -35,7 +35,7 @@ if os.path.isfile(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked") \ and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the moment." + print("ERROR: The month " + args.month + " is being edited at the moment.") + " Use -i if you want to force the execution of this script." sys.exit() @@ -51,7 +51,7 @@ processedFileName = utility.addMissingSlash(monthsFolder) + month \ + "/processedLogData/" + processdata.processedPrefix \ + "%02d" % day + processdata.processedSuffix - print "Working on: " + processedFileName + print("Working on: " + processedFileName) with gzip.open(processedFileName) as p, \ gzip.open("testData/processedLogData/" + @@ -63,12 +63,12 @@ chancesSelected = float(args.lines) / float(sum(1 for line in p)) - print "Done counting lines, chance for selection is " \ - + str(chancesSelected) + print("Done counting lines, chance for selection is " \ + + str(chancesSelected)) headerRow = True - for processed, source in izip(gzip.open(processedFileName), + for processed, source in zip(gzip.open(processedFileName), gzip.open(monthsFolder + "/" + month + "/rawLogData/" + processdata.sourcePrefix + diff --git a/tools/createTestData.py.bak b/tools/createTestData.py.bak new file mode 100644 index 0000000..9b6d350 --- /dev/null +++ b/tools/createTestData.py.bak @@ -0,0 +1,82 @@ +import argparse +import os +import sys +from postprocess import processdata +from utility import utility +import config +import glob +import gzip +from itertools import izip +from random import random + +parser = argparse.ArgumentParser( + description="Creates a smaller testdata set for developing of the" + + " current querys in this folder") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="the folder in which the months" + + " 
directory are residing") +parser.add_argument("--ignoreLock", "-i", + help="Ignore locked file and execute anyways", + action="store_true") +parser.add_argument("month", type=str, + help="the month which we're interested in") +parser.add_argument("lines", type=int, + help="number of lines the testfiles should have") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() +monthsFolder = args.monthsFolder +month = args.month + + +if os.path.isfile(utility.addMissingSlash(monthsFolder) + + utility.addMissingSlash(month) + "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the moment." + + " Use -i if you want to force the execution of this script." + sys.exit() + +# create new folder for the test data +os.makedirs("testData/processedLogData") +os.makedirs("testData/rawLogData") + +for filename in glob.glob(monthsFolder + "/" + month + + "/processedLogData/" + processdata.processedPrefix + + "*" + processdata.processedSuffix): + day = int(os.path.basename(filename)[len(processdata.processedPrefix):] + [:-len(processdata.processedSuffix)]) + processedFileName = utility.addMissingSlash(monthsFolder) + month \ + + "/processedLogData/" + processdata.processedPrefix \ + + "%02d" % day + processdata.processedSuffix + print "Working on: " + processedFileName + + with gzip.open(processedFileName) as p, \ + gzip.open("testData/processedLogData/" + + processdata.processedPrefix + "%02d" % day + + ".tsv.gz", "wb") as pc,\ + gzip.open("testData/rawLogData/" + + processdata.sourcePrefix + + "%02d" % day + ".tsv.gz", "wb") as sc: + + chancesSelected = float(args.lines) / float(sum(1 for line in p)) + + print "Done counting lines, chance for selection is " \ + + str(chancesSelected) + + headerRow = True + + for processed, source in izip(gzip.open(processedFileName), + gzip.open(monthsFolder + "/" + month + + "/rawLogData/" + + processdata.sourcePrefix + + "%02d" % day + ".tsv.gz")): + 
if(random() >= chancesSelected and not headerRow): + continue + elif headerRow: + headerRow = False + + pc.write(processed) + sc.write(source) diff --git a/tools/exampleDatasetGenerator.py b/tools/exampleDatasetGenerator.py index 1c86792..c711cec 100644 --- a/tools/exampleDatasetGenerator.py +++ b/tools/exampleDatasetGenerator.py @@ -1,13 +1,13 @@ -from __future__ import print_function + import argparse import glob import gzip import random import sys -import urllib +import urllib.request, urllib.parse, urllib.error -from utility import utility +from .utility import utility parser = argparse.ArgumentParser(description = "This script generates an example raw log file based on a folder with example queries") parser.add_argument("--exampleQueryFolder", "-e", default="", @@ -34,7 +34,7 @@ for filename in glob.glob(exampleQueryFolder + "*.exampleQuery"): with open(filename) as exampleFile: - line = "?query=" + urllib.quote_plus(exampleFile.read()) + "\t" + line = "?query=" + urllib.parse.quote_plus(exampleFile.read()) + "\t" line += random.sample(uri_path, 1)[0] + "\t" line += random.sample(user_agent, 1)[0] + "\t" line += random.sample(timestamp, 1)[0] + "\t" diff --git a/tools/exampleDatasetGenerator.py.bak b/tools/exampleDatasetGenerator.py.bak new file mode 100644 index 0000000..1c86792 --- /dev/null +++ b/tools/exampleDatasetGenerator.py.bak @@ -0,0 +1,44 @@ +from __future__ import print_function + +import argparse +import glob +import gzip +import random +import sys +import urllib + +from utility import utility + +parser = argparse.ArgumentParser(description = "This script generates an example raw log file based on a folder with example queries") +parser.add_argument("--exampleQueryFolder", "-e", default="", + type=str, help="The folder in which the example queries are residing.") +parser.add_argument("--outputDirectory", "-o", default="", + type=str, help="The directory in which the QueryCnt01.tsv.gz-file should be created.") + +if (len(sys.argv[1:]) == 0): + 
parser.print_help() + parser.exit() + +args = parser.parse_args() + +uri_path = {"/sparql", "/bigdata/namespace/wdq/sparql"} +user_agent = {"Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0"} +timestamp = {"2017-07-01 01:31:14", "2017-07-01 01:32:54", "2017-07-01 01:34:10"} +agent_type = {"spider", "user"} +http_status = "200" + +with gzip.open(utility.addMissingSlash(args.outputDirectory) + "QueryCnt01.tsv.gz", "w") as target: + print("uri_query\turi_path\tuser_agent\tts\tagent_type\thour\thttp_status", file = target) + + exampleQueryFolder = utility.addMissingSlash(args.exampleQueryFolder) + + for filename in glob.glob(exampleQueryFolder + "*.exampleQuery"): + with open(filename) as exampleFile: + line = "?query=" + urllib.quote_plus(exampleFile.read()) + "\t" + line += random.sample(uri_path, 1)[0] + "\t" + line += random.sample(user_agent, 1)[0] + "\t" + line += random.sample(timestamp, 1)[0] + "\t" + line += random.sample(agent_type, 1)[0] + "\t" + line += str(random.randint(0,23)) + "\t" + line += http_status + print(line, file = target) diff --git a/tools/extract.py b/tools/extract.py index d1265f9..bfc067b 100644 --- a/tools/extract.py +++ b/tools/extract.py @@ -1,12 +1,12 @@ import argparse -import config +from . import config import csv import gzip import os import sys -from itertools import izip -from utility import utility + +from .utility import utility os.nice(19) @@ -165,15 +165,15 @@ def write(self, processed, source): monthFolder = utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(month) if os.path.isfile(monthFolder + "locked") and not args.ignoreLock: - print "ERROR: The month " + month + " is being edited at the moment. Use -i if you want to force the execution of this script." + print("ERROR: The month " + month + " is being edited at the moment. 
Use -i if you want to force the execution of this script.") sys.exit() - for i in xrange(1, 32): + for i in range(1, 32): processed = monthFolder + config.processedPrefix + "%02d" % i + ".tsv.gz" source = monthFolder + config.sourcePrefix + "%02d" % i + ".tsv.gz" if not (os.path.exists(processed) and gzip.os.path.exists(source)): continue - print "Working on %02d" % i + print("Working on %02d" % i) with gzip.open(processed) as p, gzip.open(source) as s: pReader = csv.DictReader(p, delimiter="\t") sReader = csv.DictReader(s, delimiter="\t") @@ -181,6 +181,6 @@ def write(self, processed, source): for dataset in datasets: dataset.open(pReader, sReader, i) - for processed, source in izip(pReader, sReader): + for processed, source in zip(pReader, sReader): for dataset in datasets: dataset.write(processed, source) diff --git a/tools/extract.py.bak b/tools/extract.py.bak new file mode 100644 index 0000000..d1265f9 --- /dev/null +++ b/tools/extract.py.bak @@ -0,0 +1,186 @@ +import argparse +import config +import csv +import gzip +import os +import sys + +from itertools import izip +from utility import utility + +os.nice(19) + +class simpleDataset(object): + subfolder = "simpleDataset/" + p = None + s = None + pWriter = None + sWriter = None + + def open(self, pReader, sReader, day): + if self.p != None: + self.p.close() + if self.s != None: + self.s.close() + processedFolder = monthFolder + self.subfolder + config.processedFolderName + if not os.path.exists(processedFolder): + os.makedirs(processedFolder) + + sourceFolder = monthFolder + self.subfolder + config.sourceFolderName + if not os.path.exists(sourceFolder): + os.makedirs(sourceFolder) + + self.p = gzip.open(processedFolder + config.processedFilePrefix + "%02d" % day + ".tsv.gz", "w") + self.s = gzip.open(sourceFolder + config.sourceFilePrefix + "%02d" % day + ".tsv.gz", "w") + + self.pWriter = csv.DictWriter(self.p, None, delimiter="\t") + self.sWriter = csv.DictWriter(self.s, None, delimiter="\t") + + if 
self.pWriter.fieldnames is None: + ph = dict((h, h) for h in pReader.fieldnames) + self.pWriter.fieldnames = pReader.fieldnames + self.pWriter.writerow(ph) + + if self.sWriter.fieldnames is None: + sh = dict((h, h) for h in sReader.fieldnames) + self.sWriter.fieldnames = sReader.fieldnames + self.sWriter.writerow(sh) + + def write(self, processed, source): + self.pWriter.writerow(processed) + self.sWriter.writerow(source) + + def close(self): + self.pWriter.close() + self.sWriter.close() + +class uniqueDataset(simpleDataset): + subfolder = "uniqueDataset/" + + def write(self, processed, source): + if (processed['First'] == "FIRST"): + self.pWriter.writerow(processed) + self.sWriter.writerow(source) + +class userDataset(simpleDataset): + subfolder = "userData/" + + def write(self, processed, source): + if (processed["SourceCategory"] == "USER"): + self.pWriter.writerow(processed) + self.sWriter.writerow(source) + +class nonUserDataset(simpleDataset): + subfolder = "nonUserData/" + + def write(self, processed, source): + if (processed["SourceCategory"] != "USER"): + self.pWriter.writerow(processed) + self.sWriter.writerow(source) + +class status2xxDataset(simpleDataset): + subfolder = "status2xx/" + + def write(self, processed, source): + if (source["http_status"].startswith("2")): + self.pWriter.writerow(processed) + self.sWriter.writerow(source) + +class status500Dataset(simpleDataset): + subfolder = "status500/" + + def write(self, processed, source): + if (source["http_status"] == ("500")): + self.pWriter.writerow(processed) + self.sWriter.writerow(source) + +parser = argparse.ArgumentParser( + description="Creates subsets of the raw and processed log files depending on choosen criteria." +) +parser.add_argument( + "--monthsFolder", + "-m", + default=config.monthsFolder, + type=str, + help="The folder in which the month directories are residing." 
+) +parser.add_argument( + "--ignoreLock", + "-i", + help="Ignore locked file and execute anyways", + action="store_true" +) +parser.add_argument( + "--uniqueDataset", + "-q", + help="A subset containing each unique query exactly once.", + action="store_true" +) +parser.add_argument( + "--userDataset", + "-u", + help="A subset containing only queries posed by users.", + action="store_true" +) +parser.add_argument( + "--nonUserDataset", + "-n", + help="A subset containing only queries posed by non-users.", + action="store_true" +) +parser.add_argument( + "--status2xxDataset", + "-s2", + help="A subset containing only queries with http status 2xx.", + action="store_true" +) +parser.add_argument( + "--status500Dataset", + "-s5", + help="A subset containing only queries with http status 500.", + action="store_true" +) +parser.add_argument("months", type=str, help="The months of which subsets should be generated, separated by comma ',' if necessary.") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +datasets = list() + +if args.uniqueDataset: + datasets.append(uniqueDataset()) +if args.userDataset: + datasets.append(userDataset()) +if args.nonUserDataset: + datasets.append(nonUserDataset()) +if args.status2xxDataset: + datasets.append(status2xxDataset()) +if args.status500Dataset: + datasets.append(status500Dataset()) + +for month in args.months.split(","): + monthFolder = utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(month) + + if os.path.isfile(monthFolder + "locked") and not args.ignoreLock: + print "ERROR: The month " + month + " is being edited at the moment. Use -i if you want to force the execution of this script." 
+ sys.exit() + + for i in xrange(1, 32): + processed = monthFolder + config.processedPrefix + "%02d" % i + ".tsv.gz" + source = monthFolder + config.sourcePrefix + "%02d" % i + ".tsv.gz" + if not (os.path.exists(processed) and gzip.os.path.exists(source)): + continue + print "Working on %02d" % i + with gzip.open(processed) as p, gzip.open(source) as s: + pReader = csv.DictReader(p, delimiter="\t") + sReader = csv.DictReader(s, delimiter="\t") + + for dataset in datasets: + dataset.open(pReader, sReader, i) + + for processed, source in izip(pReader, sReader): + for dataset in datasets: + dataset.write(processed, source) diff --git a/tools/extractCachedData.py b/tools/extractCachedData.py index 738d54e..408cc6d 100644 --- a/tools/extractCachedData.py +++ b/tools/extractCachedData.py @@ -7,10 +7,10 @@ import sys from dateutil.parser import parse -from itertools import izip -import config -from utility import utility + +from . import config +from .utility import utility parser = argparse.ArgumentParser( description="Creates two subsets of the raw log files and the processed log files that would have been cached / not been cached.") @@ -30,7 +30,7 @@ args = parser.parse_args() if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script." + print("ERROR: The month " + args.month + " is being edited at the moment. 
Use -i if you want to force the execution of this script.") sys.exit() os.chdir(utility.addMissingSlash(args.monthsFolder) @@ -54,11 +54,11 @@ valueErrors = list() -for i in xrange(1, 32): +for i in range(1, 32): if not (os.path.exists(processedPrefix + "%02d" % i + ".tsv.gz") and gzip.os.path.exists(sourcePrefix + "%02d" % i + ".tsv.gz")): continue - print "Working on %02d" % i + print("Working on %02d" % i) with gzip.open(processedPrefix + "%02d" % i + ".tsv.gz") as p, \ gzip.open(sourcePrefix + "%02d" % i + ".tsv.gz") as s, \ gzip.open(subfolderCached + processedPrefix + "%02d" % i + ".tsv.gz", "w") as cached_p, \ @@ -78,7 +78,7 @@ lasttime = None - for processed, source in izip(pReader, sReader): + for processed, source in zip(pReader, sReader): if cachedpWriter.fieldnames is None: ph = dict((h, h) for h in pReader.fieldnames) cachedpWriter.fieldnames = pReader.fieldnames @@ -113,7 +113,7 @@ uncachedsWriter.writerow(source) else: if (timestamp - lasttime).total_seconds() >= 60: - for k, v in cache.items(): + for k, v in list(cache.items()): if (timestamp - v).total_seconds() / 60 > 5.0: del cache[k] if uri_query in cache: @@ -133,6 +133,6 @@ cache[uri_query] = timestamp if len(valueErrors) > 0: - print "Value errors for time stamps:" + print("Value errors for time stamps:") for error in valueErrors: - print error + print(error) diff --git a/tools/extractCachedData.py.bak b/tools/extractCachedData.py.bak new file mode 100644 index 0000000..738d54e --- /dev/null +++ b/tools/extractCachedData.py.bak @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- + +import argparse +import csv +import gzip +import os +import sys + +from dateutil.parser import parse +from itertools import izip + +import config +from utility import utility + +parser = argparse.ArgumentParser( + description="Creates two subsets of the raw log files and the processed log files that would have been cached / not been cached.") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + 
type=str, help="the folder in which the months directory " + + "are residing") +parser.add_argument("--ignoreLock", "-i", + help="Ignore locked file and execute anyways", + action="store_true") +parser.add_argument("month", type=str, + help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script." + sys.exit() + +os.chdir(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month)) + +subfolderCached = "cachedData/" +subfolderUncached = "uncachedData/" + +processedPrefix = config.processedPrefix +sourcePrefix = config.sourcePrefix + +if not os.path.exists(subfolderCached): + os.makedirs(subfolderCached) + os.makedirs(subfolderCached + "processedLogData") + os.makedirs(subfolderCached + "rawLogData") + +if not os.path.exists(subfolderUncached): + os.makedirs(subfolderUncached) + os.makedirs(subfolderUncached + "processedLogData") + os.makedirs(subfolderUncached + "rawLogData") + +valueErrors = list() + +for i in xrange(1, 32): + if not (os.path.exists(processedPrefix + "%02d" % i + ".tsv.gz") + and gzip.os.path.exists(sourcePrefix + "%02d" % i + ".tsv.gz")): + continue + print "Working on %02d" % i + with gzip.open(processedPrefix + "%02d" % i + ".tsv.gz") as p, \ + gzip.open(sourcePrefix + "%02d" % i + ".tsv.gz") as s, \ + gzip.open(subfolderCached + processedPrefix + "%02d" % i + ".tsv.gz", "w") as cached_p, \ + gzip.open(subfolderCached + sourcePrefix + "%02d" % i + ".tsv.gz", "w") as cached_s, \ + gzip.open(subfolderUncached + processedPrefix + "%02d" % i + ".tsv.gz", "w") as uncached_p, \ + gzip.open(subfolderUncached + sourcePrefix + "%02d" % i + ".tsv.gz", "w") as uncached_s: + pReader = 
csv.DictReader(p, delimiter="\t") + sReader = csv.DictReader(s, delimiter="\t") + + cachedpWriter = csv.DictWriter(cached_p, None, delimiter="\t") + cachedsWriter = csv.DictWriter(cached_s, None, delimiter="\t") + + uncachedpWriter = csv.DictWriter(uncached_p, None, delimiter="\t") + uncachedsWriter = csv.DictWriter(uncached_s, None, delimiter="\t") + + cache = dict() + + lasttime = None + + for processed, source in izip(pReader, sReader): + if cachedpWriter.fieldnames is None: + ph = dict((h, h) for h in pReader.fieldnames) + cachedpWriter.fieldnames = pReader.fieldnames + cachedpWriter.writerow(ph) + + if uncachedpWriter.fieldnames is None: + ph = dict((h, h) for h in pReader.fieldnames) + uncachedpWriter.fieldnames = pReader.fieldnames + uncachedpWriter.writerow(ph) + + if cachedsWriter.fieldnames is None: + sh = dict((h, h) for h in sReader.fieldnames) + cachedsWriter.fieldnames = sReader.fieldnames + cachedsWriter.writerow(sh) + + if uncachedsWriter.fieldnames is None: + sh = dict((h, h) for h in sReader.fieldnames) + uncachedsWriter.fieldnames = sReader.fieldnames + uncachedsWriter.writerow(sh) + + uri_query = source["uri_query"] + try: + timestamp = parse(source["ts"]) + except ValueError: + valueErrors.append(source["ts"]) + continue + + if lasttime == None: + lasttime = timestamp + cache[uri_query] = timestamp + uncachedpWriter.writerow(processed) + uncachedsWriter.writerow(source) + else: + if (timestamp - lasttime).total_seconds() >= 60: + for k, v in cache.items(): + if (timestamp - v).total_seconds() / 60 > 5.0: + del cache[k] + if uri_query in cache: + cacheTime = cache[uri_query] + + if (timestamp - cacheTime).total_seconds() / 60 > 5.0: + uncachedpWriter.writerow(processed) + uncachedsWriter.writerow(source) + del cache[uri_query] + else: + cachedpWriter.writerow(processed) + cachedsWriter.writerow(source) + + else: + uncachedpWriter.writerow(processed) + uncachedsWriter.writerow(source) + cache[uri_query] = timestamp + +if len(valueErrors) > 0: + 
print "Value errors for time stamps:"
+    for error in valueErrors:
+        print error
diff --git a/tools/extractQueryTypeDataRanking.py b/tools/extractQueryTypeDataRanking.py
index 2883b40..8589509 100644
--- a/tools/extractQueryTypeDataRanking.py
+++ b/tools/extractQueryTypeDataRanking.py
@@ -4,15 +4,15 @@
 import os
 import shutil
 import sys
-import urlparse
+import urllib.parse
 
 import pandas
-from itertools import izip
 
-import config
-import fieldRanking
-from utility import utility
+from . import config
+from . import fieldRanking
+
+from .utility import utility
 
 # Generates a list of all query types sorted in descending order by number of
 # appearance based on fieldRanking.tsv's output for the field #QueryType
@@ -53,7 +53,7 @@
 if os.path.isfile(utility.addMissingSlash(args.monthsFolder)
                   + utility.addMissingSlash(args.month) + "locked") \
         and not args.ignoreLock:
-    print "ERROR: The month " + args.month + " is being edited at the moment."
-    + " Use -i if you want to force the execution of this script."
+    print("ERROR: The month " + args.month + " is being edited at the moment."
+          + " Use -i if you want to force the execution of this script.")
 
sys.exit() @@ -86,37 +86,37 @@ queryTypes = dict() queryTypeCount = fieldRanking.fieldRanking(args.month, "QueryType", monthsFolder = args.monthsFolder, ignoreLock = args.ignoreLock) - for i, (k, v) in enumerate(sorted(queryTypeCount.iteritems(), key=lambda (k, v): (v, k), reverse=True)): + for i, (k, v) in enumerate(sorted(iter(queryTypeCount.items()), key=lambda k_v: (k_v[1], k_v[0]), reverse=True)): if i >= args.topNumber and args.topNumber != 0: break i += 1 queryTypes[k] = v - for i in xrange(1, 32): + for i in range(1, 32): if not (os.path.exists(processedPrefix + "%02d" % i + ".tsv.gz") and os.path.exists(sourcePrefix + "%02d" % i + ".tsv.gz")): continue - print "Working on %02d" % i + print("Working on %02d" % i) with gzip.open(processedPrefix + "%02d" % i + ".tsv.gz") as p, \ gzip.open(sourcePrefix + "%02d" % i + ".tsv.gz") as s: pReader = csv.DictReader(p, delimiter="\t") sReader = csv.DictReader(s, delimiter="\t") - for processed, source in izip(pReader, sReader): + for processed, source in zip(pReader, sReader): queryType = processed["QueryType"] if queryType in queryTypes: processedToWrite = dict() - d = dict(urlparse.parse_qsl( - urlparse.urlsplit(source['uri_query']).query)) - if 'query' in d.keys(): + d = dict(urllib.parse.parse_qsl( + urllib.parse.urlsplit(source['uri_query']).query)) + if 'query' in list(d.keys()): processedToWrite['ExampleQuery'] = d['query'] else: processedToWrite['ExampleQuery'] = "" - print "ERROR: Could not find query in uri_query:" - print source['uri_query'] + print("ERROR: Could not find query in uri_query:") + print(source['uri_query']) for key in processed: if key in fieldBasedOnQueryType: @@ -129,13 +129,13 @@ del queryTypes[queryType] if len(queryTypes) > 0: - print "Could not find examples for the following query types:" + print("Could not find examples for the following query types:") for key in queryTypes: - print "\t" + key + print("\t" + key) df = pandas.read_csv(pathBase + fileName, sep="\t", header=0, 
index_col=0) df = df.sort_values(by=["QueryTypeCount"], ascending=False) df.to_csv(pathBase + fileName, sep="\t") -print "Done." +print("Done.") diff --git a/tools/extractQueryTypeDataRanking.py.bak b/tools/extractQueryTypeDataRanking.py.bak new file mode 100644 index 0000000..2883b40 --- /dev/null +++ b/tools/extractQueryTypeDataRanking.py.bak @@ -0,0 +1,141 @@ +import argparse +import csv +import gzip +import os +import shutil +import sys +import urlparse + +import pandas +from itertools import izip + +import config +import fieldRanking + +from utility import utility + +# Generates a list of all query types sorted in descending order by number of +# appearance based on fieldRanking.tsv's output for the field #QueryType +# The file contains all metrics that are specific for the query type (ignoring +# things like subject and object names) as well as one example query for this +# query type + +fieldBasedOnQueryType = ['Valid', 'QuerySize', 'VariableCountHead', + 'VariableCountPattern', 'TripleCountWithService', + 'QueryComplexity', 'SubjectsAndObjects', 'Predicates', + 'Categories', 'UsedSparqlFeatures'] + +# Number of query types to be extracted, use 0 for infinity + +parser = argparse.ArgumentParser( + description="Generates a list of all query types sorted in descending" + + " order by number of appearance based on fieldRanking.py's output for" + + " the field #QueryType") +parser.add_argument("--monthsFolder", "-m", + default=config.monthsFolder, type=str, + help="The folder in which the months directories are" + + " residing.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") +parser.add_argument("--topNumber", "-n", default=0, type=int, + help="The top n query types should be present in the " + + "generated file.") +parser.add_argument("month", type=str, + help="The month from which the query type file should be " + + "generated.") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + 
parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the moment." + + " Use -i if you want to force the execution of this script." + sys.exit() + +os.chdir(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month)) + +pathBase = "queryTypeDataRanking/" +fileName = "Query_Type_Data_Ranking.tsv" + +processedPrefix = config.processedPrefix +sourcePrefix = config.sourcePrefix + +if not os.path.exists(pathBase): + os.makedirs(pathBase) + +with open(pathBase + fileName, "w") as types: + typeWriter = csv.DictWriter(types, None, delimiter="\t") + + th = {"QueryType":"QueryType", "QueryTypeCount":"QueryTypeCount", "ExampleQuery":"ExampleQuery"} + + typeWriter.fieldnames = ["QueryType", "QueryTypeCount", "ExampleQuery"] + + for h in fieldBasedOnQueryType: + th[h] = h + typeWriter.fieldnames.append(h) + typeWriter.writerow(th) + + i = 0 + + queryTypes = dict() + + queryTypeCount = fieldRanking.fieldRanking(args.month, "QueryType", monthsFolder = args.monthsFolder, ignoreLock = args.ignoreLock) + for i, (k, v) in enumerate(sorted(queryTypeCount.iteritems(), key=lambda (k, v): (v, k), reverse=True)): + if i >= args.topNumber and args.topNumber != 0: + break + i += 1 + + queryTypes[k] = v + + for i in xrange(1, 32): + if not (os.path.exists(processedPrefix + "%02d" % i + ".tsv.gz") + and os.path.exists(sourcePrefix + "%02d" % i + ".tsv.gz")): + continue + print "Working on %02d" % i + with gzip.open(processedPrefix + "%02d" % i + ".tsv.gz") as p, \ + gzip.open(sourcePrefix + "%02d" % i + ".tsv.gz") as s: + pReader = csv.DictReader(p, delimiter="\t") + sReader = csv.DictReader(s, delimiter="\t") + + for processed, source in izip(pReader, sReader): + queryType = processed["QueryType"] + if queryType in queryTypes: + + processedToWrite = dict() + + d = 
dict(urlparse.parse_qsl(
+                    urlparse.urlsplit(source['uri_query']).query))
+                if 'query' in d.keys():
+                    processedToWrite['ExampleQuery'] = d['query']
+                else:
+                    processedToWrite['ExampleQuery'] = ""
+                    print "ERROR: Could not find query in uri_query:"
+                    print source['uri_query']
+
+                for key in processed:
+                    if key in fieldBasedOnQueryType:
+                        processedToWrite[key] = processed[key]
+
+                processedToWrite["QueryType"] = queryType
+                processedToWrite["QueryTypeCount"] = queryTypes[queryType]
+
+                typeWriter.writerow(processedToWrite)
+                del queryTypes[queryType]
+
+if len(queryTypes) > 0:
+    print "Could not find examples for the following query types:"
+    for key in queryTypes:
+        print "\t" + key
+
+df = pandas.read_csv(pathBase + fileName, sep="\t",
+                     header=0, index_col=0)
+df = df.sort_values(by=["QueryTypeCount"], ascending=False)
+df.to_csv(pathBase + fileName, sep="\t")
+
+print "Done."
diff --git a/tools/extractQueryTypeDataset.py b/tools/extractQueryTypeDataset.py
index 5b845e3..7cd5954 100644
--- a/tools/extractQueryTypeDataset.py
+++ b/tools/extractQueryTypeDataset.py
@@ -1,11 +1,11 @@
 import argparse
-import config
+from . import config
 import csv
 import gzip
 import os
 import sys
-from itertools import izip
-from utility import utility
+
+from .utility import utility
 
 parser = argparse.ArgumentParser(
     description="Creates a subset of the raw log files and the processed log "
@@ -34,7 +34,7 @@
 if os.path.isfile(utility.addMissingSlash(args.monthsFolder)
                   + utility.addMissingSlash(args.month) + "locked") \
         and not args.ignoreLock:
-    print "ERROR: The month " + args.month + " is being edited at the moment."
-    + "Use -i if you want to force the execution of this script."
+    print("ERROR: The month " + args.month + " is being edited at the moment."
+          + "Use -i if you want to force the execution of this script.")
 
sys.exit() @@ -57,11 +57,11 @@ usedQueryTypes = set() - for i in xrange(1, 32): + for i in range(1, 32): if not (os.path.exists(subfolder + processedPrefix + "%02d" % i + ".tsv.gz") and gzip.os.path.exists(subfolder + sourcePrefix + "%02d" % i + ".tsv.gz")): continue - print "Working on %02d" % i + print("Working on %02d" % i) with gzip.open(subfolder + processedPrefix + "%02d" % i + ".tsv.gz") as p, \ gzip.open(subfolder + sourcePrefix + "%02d" % i + ".tsv.gz") as s, \ gzip.open(subfolderUnique + processedPrefix + "%02d" % i @@ -74,7 +74,7 @@ pWriter = csv.DictWriter(user_p, None, delimiter="\t") sWriter = csv.DictWriter(user_s, None, delimiter="\t") - for processed, source in izip(pReader, sReader): + for processed, source in zip(pReader, sReader): if pWriter.fieldnames is None: ph = dict((h, h) for h in pReader.fieldnames) pWriter.fieldnames = pReader.fieldnames diff --git a/tools/extractQueryTypeDataset.py.bak b/tools/extractQueryTypeDataset.py.bak new file mode 100644 index 0000000..5b845e3 --- /dev/null +++ b/tools/extractQueryTypeDataset.py.bak @@ -0,0 +1,91 @@ +import argparse +import config +import csv +import gzip +import os +import sys +from itertools import izip +from utility import utility + +parser = argparse.ArgumentParser( + description="Creates a subset of the raw log files and the processed log " + + "files where for each present Unique Query an example Query is being " + + "inserted") +parser.add_argument( + "--monthsFolder", + "-m", + default=config.monthsFolder, + type=str, + help="the folder in which the months directory " + "are residing") +parser.add_argument( + "--ignoreLock", + "-i", + help="Ignore locked file and execute anyways", + action="store_true") +parser.add_argument( + "month", type=str, help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) 
+ "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the moment." + + "Use -i if you want to force the execution of this script." + sys.exit() + +for monthName in args.month.split(","): + subfolder = utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(monthName) + subfolderUnique = subfolder + "queryTypeDataset/" + + processedFolderName = "processedLogData/" + sourceFolderName = "rawLogData/" + + processedPrefix = processedFolderName + "QueryProcessedOpenRDF" + sourcePrefix = sourceFolderName + "QueryCnt" + + if not os.path.exists(subfolderUnique): + os.makedirs(subfolderUnique) + if not os.path.exists(subfolderUnique + processedFolderName): + os.makedirs(subfolderUnique + processedFolderName) + if not os.path.exists(subfolderUnique + sourceFolderName): + os.makedirs(subfolderUnique + sourceFolderName) + + usedQueryTypes = set() + + for i in xrange(1, 32): + if not (os.path.exists(subfolder + processedPrefix + "%02d" % i + ".tsv.gz") + and gzip.os.path.exists(subfolder + sourcePrefix + "%02d" % i + ".tsv.gz")): + continue + print "Working on %02d" % i + with gzip.open(subfolder + processedPrefix + "%02d" % i + ".tsv.gz") as p, \ + gzip.open(subfolder + sourcePrefix + "%02d" % i + ".tsv.gz") as s, \ + gzip.open(subfolderUnique + processedPrefix + "%02d" % i + + ".tsv.gz", "w") as user_p, \ + gzip.open(subfolderUnique + sourcePrefix + "%02d" % i + + ".tsv.gz", "w") as user_s: + pReader = csv.DictReader(p, delimiter="\t") + sReader = csv.DictReader(s, delimiter="\t") + + pWriter = csv.DictWriter(user_p, None, delimiter="\t") + sWriter = csv.DictWriter(user_s, None, delimiter="\t") + + for processed, source in izip(pReader, sReader): + if pWriter.fieldnames is None: + ph = dict((h, h) for h in pReader.fieldnames) + pWriter.fieldnames = pReader.fieldnames + pWriter.writerow(ph) + + if sWriter.fieldnames is None: + sh = dict((h, h) for h in sReader.fieldnames) + sWriter.fieldnames = sReader.fieldnames + 
sWriter.writerow(sh)
+
+            if (processed['QueryType'] not in usedQueryTypes):
+                pWriter.writerow(processed)
+                sWriter.writerow(source)
+                usedQueryTypes.add(processed['QueryType'])
diff --git a/tools/featureVectors.py b/tools/featureVectors.py
index e091839..ef1c995 100644
--- a/tools/featureVectors.py
+++ b/tools/featureVectors.py
@@ -2,17 +2,17 @@
 import os
 import sys
 from collections import defaultdict
-from postprocess import processdata
-from utility import utility
+from .postprocess import processdata
+from .utility import utility
 
-import config
-import fieldRanking
+from . import config
+from . import fieldRanking
 
 def featureVectors(month, metric, monthsFolder = config.monthsFolder, threshold = 100, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", writeOut = False, notifications = True):
     if os.path.isfile(utility.addMissingSlash(monthsFolder)
                       + utility.addMissingSlash(month) + "locked") \
             and not ignoreLock:
-        print "ERROR: The month " + month + " is being edited at the moment."
-            + " Use -i or ignoreLock = True if you want to force the execution of this script."
+        print("ERROR: The month " + month + " is being edited at the moment."
+              + " Use -i or ignoreLock = True if you want to force the execution of this script.")
 
sys.exit() @@ -39,7 +39,7 @@ def featureVectors(month, metric, monthsFolder = config.monthsFolder, threshold vectorEntries = set() result = fieldRanking.fieldRanking(month, metric, monthsFolder, ignoreLock = ignoreLock, outputPath = outputPath, outputFilename = outputFilename, filterParams = filterParams, nosplitting = True, writeOut = False, notifications = notifications) - for keyOneEntry, keyOneEntryCount in sorted(result.iteritems(), key = lambda (k, v): (v, k), reverse = True): + for keyOneEntry, keyOneEntryCount in sorted(iter(result.items()), key = lambda k_v: (k_v[1], k_v[0]), reverse = True): if keyOneEntryCount < threshold: break diff --git a/tools/featureVectors.py.bak b/tools/featureVectors.py.bak new file mode 100644 index 0000000..e091839 --- /dev/null +++ b/tools/featureVectors.py.bak @@ -0,0 +1,108 @@ +import argparse +import os +import sys +from collections import defaultdict +from postprocess import processdata +from utility import utility + +import config +import fieldRanking + +def featureVectors(month, metric, monthsFolder = config.monthsFolder, threshold = 100, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", writeOut = False, notifications = True): + if os.path.isfile(utility.addMissingSlash(monthsFolder) + + utility.addMissingSlash(month) + "locked") \ + and not ignoreLock: + print "ERROR: The month " + month + " is being edited at the moment." + + " Use -i or ignoreLock = True if you want to force the execution of this script." 
+ sys.exit() + + metric = utility.argMetric(metric) + + pathBase = utility.addMissingSlash(monthsFolder) \ + + utility.addMissingSlash(month) \ + + utility.addMissingSlash(metric) + + if outputPath is not None: + pathBase = utility.addMissingSlash(outputPath) + + outputFile = month.strip("/").replace("/", "_") + "_" + metric + "_feature_vectors.tsv" + + if outputFilename is not None: + outputFile = outputFilename + + filter = utility.filter() + + filter.setup(filterParams) + + vectors = list() + + vectorEntries = set() + + result = fieldRanking.fieldRanking(month, metric, monthsFolder, ignoreLock = ignoreLock, outputPath = outputPath, outputFilename = outputFilename, filterParams = filterParams, nosplitting = True, writeOut = False, notifications = notifications) + for keyOneEntry, keyOneEntryCount in sorted(result.iteritems(), key = lambda (k, v): (v, k), reverse = True): + if keyOneEntryCount < threshold: + break + + if metric not in utility.notToSplit: + entries = utility.splitEntry(keyOneEntry) + else: + entries = [keyOneEntry] + + newVector = defaultdict(int) + + for entry in entries: + newVector[entry] = 1 + vectorEntries.add(entry) + + newVector["count"] = keyOneEntryCount + + vectors.append(newVector) + + if writeOut: + if not os.path.exists(pathBase): + os.makedirs(pathBase) + with open(pathBase + outputFile, "w") as file: + headerEntries = sorted(vectorEntries) + for headerEntry in headerEntries: + file.write(str(headerEntry) + "\t") + file.write("count\n") + for vector in sorted(vectors, key=lambda entry: entry["count"], reverse=True): + for headerEntry in headerEntries: + file.write(str(vector[headerEntry]) + "\t") + file.write(str(vector['count']) + "\n") + return vectors + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="This script creates a table with columns for all features vectors that occured more than threshold times.") + parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The 
folder in which the months directory " + + "are residing.") + parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") + parser.add_argument("--suppressNotifications", "-s", help = "Suppress notifications from processdata.py.", action = "store_true") + parser.add_argument("--outputPath", "-p", type=str, help="The path where the " + + "output file should be generated.") + parser.add_argument("--outputFilename", "-o", type=str, help="The name of the output file to be generated.") + parser.add_argument("--filter", "-f", default="", type=str, help="Constraints " + + "used to limit the lines used to generate the output." + + " Default filter is Valid=^VALID$." + + " Enter as =,/ (e.g." + + " QueryType=wikidataLastModified,ToolName=^USER$)" + + " NOTE: If you use this option you should probably also" + + " set the --outputPath to some value other than the " + + "default.") + parser.add_argument("--threshold", "-t", default = 100, type = int, help = "The threshold above which the entries should be counted. 
Default is 100.") + parser.add_argument("metric", type=str, + help="The metric that should be analysed") + parser.add_argument("month", type=str, + help="The month for which the feature vectors should be generated.") + + + if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + + args = parser.parse_args() + + featureVectors(args.month, args.metric, monthsFolder = args.monthsFolder, threshold = args.threshold, ignoreLock = args.ignoreLock, outputPath = args.outputPath, outputFilename = args.outputFilename, filterParams = args.filter, writeOut = True, notifications = not args.suppressNotifications) diff --git a/tools/fieldEntriesDaysApart.py b/tools/fieldEntriesDaysApart.py index 30c126a..9576ef7 100644 --- a/tools/fieldEntriesDaysApart.py +++ b/tools/fieldEntriesDaysApart.py @@ -3,14 +3,14 @@ import sys from collections import defaultdict from dateutil import parser as dateparser -from postprocess import processdata -from utility import utility -import config +from .postprocess import processdata +from .utility import utility +from . import config def fieldEntriesDaysApart(months, metric, days, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", nosplitting = False, writeOut = False, notifications = True, anonymous = False): for month in months.split(","): if os.path.isfile(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked") and not ignoreLock: - print "ERROR: The month " + month + " is being edited at the moment." + " Use -i or ignoreLock = True if you want to force the execution of this script." + print("ERROR: The month " + month + " is being edited at the moment." 
+ " Use -i or ignoreLock = True if you want to force the execution of this script.") sys.exit() metric = utility.argMetric(metric) @@ -54,7 +54,7 @@ def handle(self, sparqlQuery, processed): try: parsedTime = dateparser.parse(timestamp) except ValueError: - print "ERROR: Faulty timestamp " + str(timestamp) + print("ERROR: Faulty timestamp " + str(timestamp)) faultyTimestamps[timestamp] += 1 continue if not key in self.firstSeen: @@ -64,7 +64,7 @@ def handle(self, sparqlQuery, processed): self.lastSeen[key] = parsedTime def compute(self): - for key, firstTS in self.firstSeen.iteritems(): + for key, firstTS in self.firstSeen.items(): lastTS = self.lastSeen[key] if (lastTS - firstTS).days >= days: self.fieldEntries.add(key) @@ -86,9 +86,9 @@ def writeOut(self): handler.compute() if len(faultyTimestamps) > 0: - print "Faulty timestamp\tcount" - for k, v in sorted(faultyTimestamps.iteritems(), key=lambda (k, v): (v, k), reverse=True): - print str(k) + "\t" + str(v) + print("Faulty timestamp\tcount") + for k, v in sorted(iter(faultyTimestamps.items()), key=lambda k_v: (k_v[1], k_v[0]), reverse=True): + print(str(k) + "\t" + str(v)) if writeOut: if not os.path.exists(pathBase): diff --git a/tools/fieldEntriesDaysApart.py.bak b/tools/fieldEntriesDaysApart.py.bak new file mode 100644 index 0000000..30c126a --- /dev/null +++ b/tools/fieldEntriesDaysApart.py.bak @@ -0,0 +1,139 @@ +import argparse +import os +import sys +from collections import defaultdict +from dateutil import parser as dateparser +from postprocess import processdata +from utility import utility +import config + +def fieldEntriesDaysApart(months, metric, days, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", nosplitting = False, writeOut = False, notifications = True, anonymous = False): + for month in months.split(","): + if os.path.isfile(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked") and not ignoreLock: 
+ print "ERROR: The month " + month + " is being edited at the moment." + " Use -i or ignoreLock = True if you want to force the execution of this script." + sys.exit() + + metric = utility.argMetric(metric) + + pathBase = utility.addMissingSlash(monthsFolder) \ + + utility.addMissingSlash(months.replace("/", "_")) \ + + utility.addMissingSlash(metric) + + if outputPath is not None: + pathBase = utility.addMissingSlash(outputPath) + + addString = "" + if anonymous: + addString = "_anonymous_" + + outputFile = month.strip("/").replace("/", "_") + "_" + metric + addString + "_" + str(days) + "_days_apart.tsv" + + if outputFilename is not None: + outputFile = outputFilename + + header = metric + "\n" + + filter = utility.filter() + + filter.setup(filterParams) + + faultyTimestamps = defaultdict(int) + + class FieldEntriesDaysApartHandler: + firstSeen = dict() + lastSeen = dict() + + fieldEntries = set() + + def handle(self, sparqlQuery, processed): + if not filter.checkLine(processed): + return + + for key in utility.fetchEntries(processed, metric, nosplitting = nosplitting): + timestamp = processed["timestamp"] + try: + parsedTime = dateparser.parse(timestamp) + except ValueError: + print "ERROR: Faulty timestamp " + str(timestamp) + faultyTimestamps[timestamp] += 1 + continue + if not key in self.firstSeen: + self.firstSeen[key] = parsedTime + self.lastSeen[key] = parsedTime + if parsedTime > self.lastSeen[key]: + self.lastSeen[key] = parsedTime + + def compute(self): + for key, firstTS in self.firstSeen.iteritems(): + lastTS = self.lastSeen[key] + if (lastTS - firstTS).days >= days: + self.fieldEntries.add(key) + + def writeOut(self): + with open(pathBase + outputFile, "w") as file: + file.write(header) + for key in self.fieldEntries: + file.write(str(key) + "\n") + + handler = FieldEntriesDaysApartHandler() + + for month in months.split(","): + if anonymous: + processdata.processMonth(handler, month, monthsFolder, anonymous = True, notifications = notifications) + 
else: + processdata.processMonth(handler, month, monthsFolder, notifications = notifications) + + handler.compute() + + if len(faultyTimestamps) > 0: + print "Faulty timestamp\tcount" + for k, v in sorted(faultyTimestamps.iteritems(), key=lambda (k, v): (v, k), reverse=True): + print str(k) + "\t" + str(v) + + if writeOut: + if not os.path.exists(pathBase): + os.makedirs(pathBase) + handler.writeOut() + return handler.fieldEntries + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="This script creates a list of all entries in a metric that are at least N days apart.") + parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory " + + "are residing.") + parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") + parser.add_argument("--suppressNotifications", "-s", help = "Suppress notifications from processdata.py.", action = "store_true") + parser.add_argument("--outputPath", "-p", type=str, help="The path where the " + + "output file should be generated.") + parser.add_argument("--outputFilename", "-o", type=str, help="The name of the output file to be generated.") + parser.add_argument("--filter", "-f", default="", type=str, help="Constraints " + + "used to limit the lines used to generate the output." + + " Default filter is Valid=^VALID$." + + " Enter as =,/ (e.g." 
+ + " QueryType=wikidataLastModified,ToolName=^USER$)" + + " NOTE: If you use this option you should probably also" + + " set the --outputPath to some value other than the " + + "default.") + parser.add_argument("--nosplitting", "-n", help="Check if you do not want the" + + " script to split entries at commas and count each part" + + " separately but instead just to sort such entries and " + + "count them as a whole.", action="store_true") + parser.add_argument("--anonymous", "-a", action="store_true", help="Check to switch to ranking the anonymous data." + + " WARNING: No processed metrics are available for anonymous data because the anonymous files" + + " do not synch up to the processed files due to dropping the invalid lines.") + parser.add_argument("metric", type=str, + help="The metric that should be ranked") + parser.add_argument("months", type=str, + help="The months for which the ranking should be " + +"generated.") + parser.add_argument("days", type=int, help="How many days should be between the entries.") + + + if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + + args = parser.parse_args() + fieldEntriesDaysApart(args.months, args.metric, args.days, monthsFolder = args.monthsFolder, ignoreLock = args.ignoreLock, outputPath = args.outputPath, outputFilename = args.outputFilename, filterParams = args.filter, nosplitting = args.nosplitting, writeOut = True, notifications = not args.suppressNotifications, anonymous = args.anonymous) diff --git a/tools/fieldRanking.py b/tools/fieldRanking.py index 1ef1a87..2831584 100644 --- a/tools/fieldRanking.py +++ b/tools/fieldRanking.py @@ -2,15 +2,15 @@ import os import sys from collections import defaultdict -from postprocess import processdata -from utility import utility -import config +from .postprocess import processdata +from .utility import utility +from . 
import config def fieldRanking(month, metric, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", nosplitting = False, writeOut = False, notifications = True, anonymous = False): if os.path.isfile(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked") \ and not ignoreLock: - print "ERROR: The month " + month + " is being edited at the moment." + print("ERROR: The month " + month + " is being edited at the moment.") + " Use -i or ignoreLock = True if you want to force the execution of this script." sys.exit() @@ -52,7 +52,7 @@ def handle(self, sparqlQuery, processed): def writeOut(self): with open(pathBase + outputFile, "w") as file: file.write(header) - for k, v in sorted(self.totalMetricCounts.iteritems(), key=lambda (k, v): (v, k), reverse=True): + for k, v in sorted(iter(self.totalMetricCounts.items()), key=lambda k_v: (k_v[1], k_v[0]), reverse=True): file.write(str(k) + "\t" + str(v) + "\n") handler = FieldRankingHandler() diff --git a/tools/fieldRanking.py.bak b/tools/fieldRanking.py.bak new file mode 100644 index 0000000..1ef1a87 --- /dev/null +++ b/tools/fieldRanking.py.bak @@ -0,0 +1,114 @@ +import argparse +import os +import sys +from collections import defaultdict +from postprocess import processdata +from utility import utility +import config + +def fieldRanking(month, metric, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", nosplitting = False, writeOut = False, notifications = True, anonymous = False): + if os.path.isfile(utility.addMissingSlash(monthsFolder) + + utility.addMissingSlash(month) + "locked") \ + and not ignoreLock: + print "ERROR: The month " + month + " is being edited at the moment." + + " Use -i or ignoreLock = True if you want to force the execution of this script." 
+ sys.exit() + + metric = utility.argMetric(metric) + + pathBase = utility.addMissingSlash(monthsFolder) \ + + utility.addMissingSlash(month) \ + + utility.addMissingSlash(metric) + + if outputPath is not None: + pathBase = utility.addMissingSlash(outputPath) + + addString = "" + if anonymous: + addString = "_anonymous_" + + outputFile = month.strip("/").replace("/", "_") + "_" + metric + addString + "_Ranking.tsv" + + if outputFilename is not None: + outputFile = outputFilename + + header = metric + "\t" + metric + "_count\n" + + filter = utility.filter() + + filter.setup(filterParams) + + + class FieldRankingHandler: + totalMetricCounts = defaultdict(int) + + def handle(self, sparqlQuery, processed): + if not filter.checkLine(processed): + return + + for key in utility.fetchEntries(processed, metric, nosplitting = nosplitting): + self.totalMetricCounts[key] += 1 + + def writeOut(self): + with open(pathBase + outputFile, "w") as file: + file.write(header) + for k, v in sorted(self.totalMetricCounts.iteritems(), key=lambda (k, v): (v, k), reverse=True): + file.write(str(k) + "\t" + str(v) + "\n") + + handler = FieldRankingHandler() + + if anonymous: + processdata.processMonth(handler, month, monthsFolder, anonymous = True, notifications = notifications) + else: + processdata.processMonth(handler, month, monthsFolder, notifications = notifications) + + + + if writeOut: + if not os.path.exists(pathBase): + os.makedirs(pathBase) + handler.writeOut() + return handler.totalMetricCounts + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="This script creates descending rankings for each day for all" + + "metrics given.") + parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory " + + "are residing.") + parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") + parser.add_argument("--suppressNotifications", 
"-s", help = "Suppress notifications from processdata.py.", action = "store_true") + parser.add_argument("--outputPath", "-p", type=str, help="The path where the " + + "output file should be generated.") + parser.add_argument("--outputFilename", "-o", type=str, help="The name of the output file to be generated.") + parser.add_argument("--filter", "-f", default="", type=str, help="Constraints " + + "used to limit the lines used to generate the output." + + " Default filter is Valid=^VALID$." + + " Enter as =,/ (e.g." + + " QueryType=wikidataLastModified,ToolName=^USER$)" + + " NOTE: If you use this option you should probably also" + + " set the --outputPath to some value other than the " + + "default.") + parser.add_argument("--nosplitting", "-n", help="Check if you do not want the" + + " script to split entries at commas and count each part" + + " separately but instead just to sort such entries and " + + "count them as a whole.", action="store_true") + parser.add_argument("--anonymous", "-a", action="store_true", help="Check to switch to ranking the anonymous data." 
+ + " WARNING: No processed metrics are available for anonymous data because the anonymous files" + + " do not synch up to the processed files due to dropping the invalid lines.") + parser.add_argument("metric", type=str, + help="The metric that should be ranked") + parser.add_argument("month", type=str, + help="The month for which the ranking should be " + +"generated.") + + + if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + + args = parser.parse_args() + + fieldRanking(args.month, args.metric, monthsFolder = args.monthsFolder, ignoreLock = args.ignoreLock, outputPath = args.outputPath, outputFilename = args.outputFilename, filterParams = args.filter, nosplitting = args.nosplitting, writeOut = True, notifications = not args.suppressNotifications, anonymous = args.anonymous) diff --git a/tools/generalStat.py b/tools/generalStat.py index ec698c3..2f31dd0 100644 --- a/tools/generalStat.py +++ b/tools/generalStat.py @@ -4,9 +4,9 @@ from collections import defaultdict from pprint import pprint -import config -from postprocess import processdata -from utility import utility +from . import config +from .postprocess import processdata +from .utility import utility import itertools parser = argparse.ArgumentParser( @@ -40,9 +40,9 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print("ERROR: The month " + str(args.month) + + print(("ERROR: The month " + str(args.month) + " is being edited at the moment." 
+ - " Use -i if you want to force the execution of this script.") + " Use -i if you want to force the execution of this script.")) sys.exit() @@ -65,12 +65,12 @@ def printStat(self): print( "Month\tFirst\tCopy\tSIMPLE\tCOMPLEX\tEXAMPLE_STRING\tEXAMPLE_PARSED" ) - print( + print(( args.month + "\t" + str(self.statistic["FIRST"]) + "\t" + str(self.statistic["COPY"]) + "\t" + str(self.statistic["SIMPLE"]) + "\t" + str(self.statistic["COMPLEX"]) + "\t" + str(self.statistic["EXAMPLE_STRING"]) + "\t" + - str(self.statistic["EXAMPLE_PARSED"])) + str(self.statistic["EXAMPLE_PARSED"]))) handler = GeneralStatisticsHandler() @@ -78,7 +78,7 @@ def printStat(self): processdata.processMonth( handler, args.month, args.monthsFolder, notifications=False) -print args.position -print "" +print(args.position) +print("") handler.printStat() diff --git a/tools/generalStat.py.bak b/tools/generalStat.py.bak new file mode 100644 index 0000000..ec698c3 --- /dev/null +++ b/tools/generalStat.py.bak @@ -0,0 +1,84 @@ +import argparse +import os +import sys +from collections import defaultdict +from pprint import pprint + +import config +from postprocess import processdata +from utility import utility +import itertools + +parser = argparse.ArgumentParser( + description="Prints out general statistics about FIRST/COPY") +parser.add_argument( + "--monthsFolder", + "-m", + default=config.monthsFolder, + type=str, + help="the folder in which the months directory " + "are residing") +parser.add_argument( + "--ignoreLock", + "-i", + help="Ignore locked file and execute" + " anyways", + action="store_true") +parser.add_argument( + "--position", + "-p", + default="default position", + type=str, + help="The position to be displayed before the data.") +parser.add_argument( + "month", type=str, help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + 
utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print("ERROR: The month " + str(args.month) + + " is being edited at the moment." + + " Use -i if you want to force the execution of this script.") + sys.exit() + + +class GeneralStatisticsHandler: + statistic = defaultdict(int) + totalCount = 0 + + def handle(self, sparqlQuery, processed): + if (processed['#Valid'] == 'VALID'): + self.totalCount += 1 + self.statistic[processed['#First']] += 1 + self.statistic[processed['#QueryComplexity']] += 1 + if processed['#ExampleQueryStringComparison'] != "NONE": + self.statistic['EXAMPLE_STRING'] += 1 + if processed['#ExampleQueryParsedComparison'] != "NONE": + self.statistic['EXAMPLE_PARSED'] += 1 + + def printStat(self): + #pprint(self.statistic) + print( + "Month\tFirst\tCopy\tSIMPLE\tCOMPLEX\tEXAMPLE_STRING\tEXAMPLE_PARSED" + ) + print( + args.month + "\t" + str(self.statistic["FIRST"]) + "\t" + + str(self.statistic["COPY"]) + "\t" + str(self.statistic["SIMPLE"]) + + "\t" + str(self.statistic["COMPLEX"]) + "\t" + + str(self.statistic["EXAMPLE_STRING"]) + "\t" + + str(self.statistic["EXAMPLE_PARSED"])) + + +handler = GeneralStatisticsHandler() + +processdata.processMonth( + handler, args.month, args.monthsFolder, notifications=False) + +print args.position +print "" + +handler.printStat() diff --git a/tools/geoHeatMap.py b/tools/geoHeatMap.py index 49ddabc..31420d1 100644 --- a/tools/geoHeatMap.py +++ b/tools/geoHeatMap.py @@ -5,9 +5,9 @@ import os.path import cartopy.crs as ccrs import matplotlib.pyplot as plt -from postprocess import processdata -from utility import utility -import config +from .postprocess import processdata +from .utility import utility +from . 
import config parser = argparse.ArgumentParser( description="Generaets a heatmap based on geo coordinates") @@ -28,7 +28,7 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the moment." + print("ERROR: The month " + args.month + " is being edited at the moment.") + " Use -i if you want to force the execution of this script." sys.exit() diff --git a/tools/geoHeatMap.py.bak b/tools/geoHeatMap.py.bak new file mode 100644 index 0000000..49ddabc --- /dev/null +++ b/tools/geoHeatMap.py.bak @@ -0,0 +1,72 @@ +import argparse +import os +from pprint import pprint +import sys +import os.path +import cartopy.crs as ccrs +import matplotlib.pyplot as plt +from postprocess import processdata +from utility import utility +import config + +parser = argparse.ArgumentParser( + description="Generaets a heatmap based on geo coordinates") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="the folder in which the months directory " + + "are residing") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") +parser.add_argument("month", type=str, + help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the moment." + + " Use -i if you want to force the execution of this script." 
+ sys.exit() + + +class GeoCoordinateCollectorHandler: + coordinates = set() + + def handle(self, sparqlQuery, processed): + if (processed['#Valid'] == 'VALID' or processed['#Valid'] == '1'): + if(processed['#Coordinates'] is not ''): + for coordinate in processed['#Coordinates'].split(","): + self.coordinates.add(coordinate) + + def saveSetToJson(self): + pprint(self.coordinates) + with open('geoCoordinates.tsv', 'w') as geoCoordinatesFile: + for coordinate in self.coordinates: + geoCoordinatesFile.write(coordinate.replace(" ", "\t") + "\n") + + +if not os.path.isfile('geoCoordinates.tsv'): + # first get all geo coordinates and save them to a file + handler = GeoCoordinateCollectorHandler() + processdata.processMonth(handler, args.month, args.monthsFolder) + handler.saveSetToJson() + +else: + # parse geoCoordinates.tsv and create choropleth map + + ax = plt.axes(projection=ccrs.PlateCarree()) + ax.coastlines() + ax.stock_img() + + with open('geoCoordinates.tsv', 'r') as file: + for line in file: + lat, lon = line.strip('\n').split(" ") + plt.plot(float(lat), float(lon), color='red', + alpha=.3, marker='.', transform=ccrs.PlateCarree()) + # pprint(line.split(" ")) + + plt.show() diff --git a/tools/getDriveStatistics.py b/tools/getDriveStatistics.py index b213d4b..bb9bf29 100644 --- a/tools/getDriveStatistics.py +++ b/tools/getDriveStatistics.py @@ -3,10 +3,10 @@ import subprocess import sys -from utility import utility -import config -import fieldRanking -import xyMapping +from .utility import utility +from . import config +from . import fieldRanking +from . 
import xyMapping os.nice(19) @@ -31,24 +31,24 @@ os.makedirs(statisticsSubfolder) def fieldRankingOn(monthFolder, metric, filename): - print "Working with fieldRanking " + metric + " on " + filename + print("Working with fieldRanking " + metric + " on " + filename) fieldRanking.fieldRanking(monthFolder, metric, monthsFolder = args.monthsFolder, outputPath = statisticsSubfolder + metric + "_Ranking", outputFilename = filename, writeOut = True, notifications = False) def xyMappingOn(monthFolder, metricOne, metricTwo, filename, nosplitOne = False, nosplitTwo = False): - print "Working with xyMapping " + metricOne + " " + metricTwo + " on " + filename + print("Working with xyMapping " + metricOne + " " + metricTwo + " on " + filename) xyMapping.xyMapping(monthFolder, metricOne, metricTwo, monthsFolder = args.monthsFolder, outputPath = statisticsSubfolder + metricOne + "_" + metricTwo, outputFilename = filename, nosplittingOne = nosplitOne, nosplittingTwo = nosplitTwo, writeOut = True, notifications = False) for monthName in args.months.split(","): cleanMonthName = monthName.strip("/").replace("/", "SLASH") if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(monthName) + "locked") and not args.ignoreLock: - print "ERROR: The month " + monthName + " is being edited at the moment. Use -i if you want to force the execution of this script." + print("ERROR: The month " + monthName + " is being edited at the moment. 
Use -i if you want to force the execution of this script.") continue month = utility.addMissingSlash(monthName) - for secondKey, secondFolder in {"user":"userData", "nonUser":"nonUserData"}.iteritems(): - for thirdKey, thirdFolder in {"all":"", "queryType":"queryTypeDataset"}.iteritems(): + for secondKey, secondFolder in {"user":"userData", "nonUser":"nonUserData"}.items(): + for thirdKey, thirdFolder in {"all":"", "queryType":"queryTypeDataset"}.items(): monthFolder = month + secondFolder + "/" + thirdFolder + "/" monthFolder = monthFolder.strip("/") @@ -64,11 +64,11 @@ def xyMappingOn(monthFolder, metricOne, metricTwo, filename, nosplitOne = False, if thirdKey is not "queryType": fieldRankingOn(monthFolder, "QueryType", filename) xyMappingOn(monthFolder, "UsedSparqlFeatures", "QuerySize", filename) - for script, scriptFolder in {"getSparqlStatistic.py":"sparqlFeatures", "operatorUsageStatistic.py":"operatorUsage", "generalStat.py":"generalStats"}.iteritems(): + for script, scriptFolder in {"getSparqlStatistic.py":"sparqlFeatures", "operatorUsageStatistic.py":"operatorUsage", "generalStat.py":"generalStats"}.items(): folder = utility.addMissingSlash(statisticsSubfolder + scriptFolder) if not os.path.exists(folder): os.makedirs(folder) - print "Working with " + script + " on " + filename + print("Working with " + script + " on " + filename) with open(folder + filename, "w") as f: if subprocess.call(['python', script, monthFolder, '-m', args.monthsFolder, '-p', monthName + "\n" + secondKey + "\n" + thirdKey], stdout = f) != 0: - print "ERROR: Could not calculate " + filename + print("ERROR: Could not calculate " + filename) diff --git a/tools/getDriveStatistics.py.bak b/tools/getDriveStatistics.py.bak new file mode 100644 index 0000000..b213d4b --- /dev/null +++ b/tools/getDriveStatistics.py.bak @@ -0,0 +1,74 @@ +import argparse +import os +import subprocess +import sys + +from utility import utility +import config +import fieldRanking +import xyMapping + 
+os.nice(19) + +parser = argparse.ArgumentParser("This script executes multiple evaluation scripts.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and " + + "execute anyways", action="store_true") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, + help="The folder in which the months directory are " + + "residing.") +parser.add_argument("months", type=str, help="The months to be processed") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +monthsFolder = utility.addMissingSlash(args.monthsFolder) +statisticsSubfolder = monthsFolder + "statistics/" +if not os.path.exists(statisticsSubfolder): + os.makedirs(statisticsSubfolder) + +def fieldRankingOn(monthFolder, metric, filename): + print "Working with fieldRanking " + metric + " on " + filename + fieldRanking.fieldRanking(monthFolder, metric, monthsFolder = args.monthsFolder, outputPath = statisticsSubfolder + metric + "_Ranking", outputFilename = filename, writeOut = True, notifications = False) + +def xyMappingOn(monthFolder, metricOne, metricTwo, filename, nosplitOne = False, nosplitTwo = False): + print "Working with xyMapping " + metricOne + " " + metricTwo + " on " + filename + xyMapping.xyMapping(monthFolder, metricOne, metricTwo, monthsFolder = args.monthsFolder, outputPath = statisticsSubfolder + metricOne + "_" + metricTwo, outputFilename = filename, nosplittingOne = nosplitOne, nosplittingTwo = nosplitTwo, writeOut = True, notifications = False) + +for monthName in args.months.split(","): + cleanMonthName = monthName.strip("/").replace("/", "SLASH") + + if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(monthName) + "locked") and not args.ignoreLock: + print "ERROR: The month " + monthName + " is being edited at the moment. Use -i if you want to force the execution of this script." 
+ continue + + month = utility.addMissingSlash(monthName) + + for secondKey, secondFolder in {"user":"userData", "nonUser":"nonUserData"}.iteritems(): + for thirdKey, thirdFolder in {"all":"", "queryType":"queryTypeDataset"}.iteritems(): + monthFolder = month + secondFolder + "/" + thirdFolder + "/" + monthFolder = monthFolder.strip("/") + + filename = cleanMonthName + "#" + secondKey + "#" + thirdKey + fieldRankingOn(monthFolder, "Predicates", filename) + fieldRankingOn(monthFolder, "Categories", filename) + fieldRankingOn(monthFolder, "TripleCountWithService", filename) + fieldRankingOn(monthFolder, "TripleCountWithoutService", filename) + fieldRankingOn(monthFolder, "ToolName", filename) + fieldRankingOn(monthFolder, "NonSimplePropertyPaths", filename) + fieldRankingOn(monthFolder, "PrimaryLanguage", filename) + fieldRankingOn(monthFolder, "ServiceCalls", filename) + if thirdKey is not "queryType": + fieldRankingOn(monthFolder, "QueryType", filename) + xyMappingOn(monthFolder, "UsedSparqlFeatures", "QuerySize", filename) + for script, scriptFolder in {"getSparqlStatistic.py":"sparqlFeatures", "operatorUsageStatistic.py":"operatorUsage", "generalStat.py":"generalStats"}.iteritems(): + folder = utility.addMissingSlash(statisticsSubfolder + scriptFolder) + if not os.path.exists(folder): + os.makedirs(folder) + print "Working with " + script + " on " + filename + with open(folder + filename, "w") as f: + if subprocess.call(['python', script, monthFolder, '-m', args.monthsFolder, '-p', monthName + "\n" + secondKey + "\n" + thirdKey], stdout = f) != 0: + print "ERROR: Could not calculate " + filename diff --git a/tools/getHourlyMetricCount.py b/tools/getHourlyMetricCount.py index 5889b46..e2427a1 100644 --- a/tools/getHourlyMetricCount.py +++ b/tools/getHourlyMetricCount.py @@ -2,9 +2,9 @@ import os import sys from collections import defaultdict -from postprocess import processdata -from utility import utility -import config +from .postprocess import processdata +from 
.utility import utility +from . import config parser = argparse.ArgumentParser( description="Counts for a given metric how often it occurs per hour. " @@ -29,7 +29,7 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the moment." + print("ERROR: The month " + args.month + " is being edited at the moment.") + "Use -i if you want to force the execution of this script." sys.exit() @@ -63,12 +63,12 @@ def saveToFiles(self, outputFolder): os.makedirs(outputFolder + "/" + self.metric) header = "hour\t" + self.metric + "\tcount\n" - for day, data in self.dailyData.iteritems(): + for day, data in self.dailyData.items(): with open(outputFolder + self.metric + "/" + "%02d" % day + "ClassifiedBotsData.tsv", "w") as outputFile: outputFile.write(header) - for hour, metricDict in data.iteritems(): - for metric in metricDict.iterkeys(): + for hour, metricDict in data.items(): + for metric in metricDict.keys(): outputFile.write(str(hour) + "\t" + str(metric) + "\t" + str(data[hour][metric]) + "\n") @@ -76,8 +76,8 @@ def saveToFiles(self, outputFolder): with open(outputFolder + self.metric + "/" + "TotalClassifiedBotsData.tsv", "w") as outputFile: outputFile.write(header) - for hour, metricDict in self.monthlyData.iteritems(): - for metric in metricDict.iterkeys(): + for hour, metricDict in self.monthlyData.items(): + for metric in metricDict.keys(): outputFile.write(str(hour) + "\t" + str(metric) + "\t" + str(self.monthlyData[hour][metric]) + "\n") @@ -87,8 +87,8 @@ def saveToFiles(self, outputFolder): processdata.processMonth(handler, args.month, args.monthsFolder) -print args.monthsFolder + "/" + args.month \ - + "/processedLogData/hourlyMetricCountData" +print(args.monthsFolder + "/" + args.month \ + + "/processedLogData/hourlyMetricCountData") handler.saveToFiles(args.monthsFolder + "/" + args.month + 
"/processedLogData/hourlyMetricCountData") diff --git a/tools/getHourlyMetricCount.py.bak b/tools/getHourlyMetricCount.py.bak new file mode 100644 index 0000000..5889b46 --- /dev/null +++ b/tools/getHourlyMetricCount.py.bak @@ -0,0 +1,94 @@ +import argparse +import os +import sys +from collections import defaultdict +from postprocess import processdata +from utility import utility +import config + +parser = argparse.ArgumentParser( + description="Counts for a given metric how often it occurs per hour. " + + "Creates then daily and a monthly tsv file containg the hour, the " + + "metric and the queryCount") +parser.add_argument("metric", type=str, help="the metric which we want to " + + "count (without #)") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="the folder in which the months directory " + + "are residing") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") +parser.add_argument("month", type=str, + help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the moment." + + "Use -i if you want to force the execution of this script." 
+ sys.exit() + + +class HourlyMetricCountHandler: + dailyData = dict() + monthlyData = dict() + metric = str + + def __init__(self, metric): + self.metric = metric + + def handle(self, sparqlQuery, processed): + if (processed['#Valid'] == 'VALID' or processed['#Valid'] == '1'): + if (processed['#day'] not in self.dailyData): + self.dailyData[processed['#day']] = dict() + if (processed['#hour'] not in self.dailyData[processed['#day']]): + self.dailyData[processed['#day']][processed['#hour']] \ + = defaultdict(int) + if (processed['#hour'] not in self.monthlyData): + self.monthlyData[processed['#hour']] = defaultdict(int) + + self.dailyData[processed['#day']][processed['#hour']] \ + [processed['#' + self.metric]] += 1 + self.monthlyData[processed['#hour']] \ + [processed['#' + self.metric]] += 1 + + def saveToFiles(self, outputFolder): + outputFolder = outputFolder + "/" + if not os.path.exists(outputFolder + "/" + self.metric): + os.makedirs(outputFolder + "/" + self.metric) + + header = "hour\t" + self.metric + "\tcount\n" + for day, data in self.dailyData.iteritems(): + with open(outputFolder + self.metric + "/" + "%02d" % day + + "ClassifiedBotsData.tsv", "w") as outputFile: + outputFile.write(header) + for hour, metricDict in data.iteritems(): + for metric in metricDict.iterkeys(): + outputFile.write(str(hour) + "\t" + str(metric) + + "\t" + str(data[hour][metric]) + + "\n") + + with open(outputFolder + self.metric + "/" + "TotalClassifiedBotsData.tsv", + "w") as outputFile: + outputFile.write(header) + for hour, metricDict in self.monthlyData.iteritems(): + for metric in metricDict.iterkeys(): + outputFile.write(str(hour) + "\t" + str(metric) + "\t" + + str(self.monthlyData[hour][metric]) + + "\n") + + +handler = HourlyMetricCountHandler(args.metric) + +processdata.processMonth(handler, args.month, args.monthsFolder) + +print args.monthsFolder + "/" + args.month \ + + "/processedLogData/hourlyMetricCountData" + +handler.saveToFiles(args.monthsFolder + "/" + 
args.month + + "/processedLogData/hourlyMetricCountData") diff --git a/tools/getSparqlStatistic.py b/tools/getSparqlStatistic.py index c983573..df08d0f 100644 --- a/tools/getSparqlStatistic.py +++ b/tools/getSparqlStatistic.py @@ -4,9 +4,9 @@ from collections import defaultdict from pprint import pprint -import config -from postprocess import processdata -from utility import utility +from . import config +from .postprocess import processdata +from .utility import utility parser = argparse.ArgumentParser(description="Prints out the SPARQL statistic") parser.add_argument( @@ -38,9 +38,9 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print("ERROR: The month " + str(args.month) + + print(("ERROR: The month " + str(args.month) + " is being edited at the moment." + - " Use -i if you want to force the execution of this script.") + " Use -i if you want to force the execution of this script.")) sys.exit() @@ -101,7 +101,7 @@ def printSparqlTranslation(self): self.printKeys(toPrintKeys) print(" ") - print(str(self.totalCount)) + print((str(self.totalCount))) handler = SparqlStatisticHandler() @@ -109,6 +109,6 @@ def printSparqlTranslation(self): processdata.processMonth( handler, args.month, args.monthsFolder, notifications=False) -print args.position +print(args.position) handler.printSparqlTranslation() diff --git a/tools/getSparqlStatistic.py.bak b/tools/getSparqlStatistic.py.bak new file mode 100644 index 0000000..c983573 --- /dev/null +++ b/tools/getSparqlStatistic.py.bak @@ -0,0 +1,114 @@ +import argparse +import os +import sys +from collections import defaultdict +from pprint import pprint + +import config +from postprocess import processdata +from utility import utility + +parser = argparse.ArgumentParser(description="Prints out the SPARQL statistic") +parser.add_argument( + "--monthsFolder", + "-m", + default=config.monthsFolder, + type=str, + help="the folder in which 
the months directory " + "are residing") +parser.add_argument( + "--ignoreLock", + "-i", + help="Ignore locked file and execute" + " anyways", + action="store_true") +parser.add_argument( + "--position", + "-p", + default="default position", + type=str, + help="The position to be displayed before the data.") +parser.add_argument( + "month", type=str, help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print("ERROR: The month " + str(args.month) + + " is being edited at the moment." + + " Use -i if you want to force the execution of this script.") + sys.exit() + + +class SparqlStatisticHandler: + statistic = defaultdict(int) + totalCount = 0 + + def handle(self, sparqlQuery, processed): + if (processed['#Valid'] == 'VALID'): + self.totalCount += 1 + usedSparqlFeatures = processed['#UsedSparqlFeatures'] + + for usedSparqlFeature in usedSparqlFeatures.split(","): + self.statistic[usedSparqlFeature.lstrip()] += 1 + + def printKeys(self, keys): + result = "" + i = 1 + for featureName in keys: + featureCount = self.statistic[featureName] + # result += featureName + ": " + str(featureCount) + "\n" + result += str(featureCount) + "\n" + + i += 1 + + print(result) + + def printSparqlTranslation(self): + self.statistic["Select"] = self.statistic["SelectQuery"] + self.statistic["Ask"] = self.statistic["AskQuery"] + self.statistic["Describe"] = self.statistic["DescribeQuery"] + self.statistic["Construct"] = self.statistic["ConstructQuery"] + self.statistic["Order By"] = self.statistic["OrderClause"] + self.statistic["Union"] = self.statistic["UnionGraphPattern"] + self.statistic["Optional"] = self.statistic["OptionalGraphPattern"] + self.statistic["Not Exists"] = self.statistic["NotExistsFunc"] + self.statistic["Minus"] = 
self.statistic["MinusGraphPattern"] + self.statistic["Exists"] = self.statistic["ExistsFunc"] + self.statistic["Group By"] = self.statistic["GroupClause"] + self.statistic["Having"] = self.statistic["HavingClause"] + self.statistic["Service"] = self.statistic["ServiceGraphPattern"] + + self.statistic["And"] = self.statistic["Join"] + self.statistic["Values"] = self.statistic["BindingValue"] + self.statistic["'+"] = self.statistic["+"] + + self.statistic["Subquery"] = self.statistic["SubSelect"] + + # only print specified columns + toPrintKeys = [ + "Select", "Ask", "Describe", "Construct", "Distinct", "Limit", + "Offset", "Order By", "Filter", "And", "Union", "Optional", + "Graph", "Not Exists", "Minus", "Exists", "Count", "Max", "Min", + "Avg", "Sum", "Group By", "Having", "Service", "LangService", + "Sample", "Bind", "GroupConcat", "Reduced", "Values", "'+", "*", + "Subquery" + ] + + self.printKeys(toPrintKeys) + print(" ") + print(str(self.totalCount)) + + +handler = SparqlStatisticHandler() + +processdata.processMonth( + handler, args.month, args.monthsFolder, notifications=False) + +print args.position + +handler.printSparqlTranslation() diff --git a/tools/joinMonth.py b/tools/joinMonth.py index 6d7b998..dd8c33b 100644 --- a/tools/joinMonth.py +++ b/tools/joinMonth.py @@ -3,12 +3,12 @@ import os import sys -import config -from utility import utility +from . import config +from .utility import utility def joinMonth(month, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None): if os.path.isfile(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked") and not ignoreLock: - print "ERROR: The month " + month + " is being edited at the moment. Use -i or ignoreLock = True if you want to force the execution of this script." + print("ERROR: The month " + month + " is being edited at the moment. 
Use -i or ignoreLock = True if you want to force the execution of this script.") sys.exit() anonymizedFolder = "anonymousRawData/" @@ -31,8 +31,8 @@ def joinMonth(month, monthsFolder = config.monthsFolder, ignoreLock = False, out with gzip.open(targetFile, "w") as target: headerSet = False - for i in xrange(1, 32): - print "Working on %02d" % i + for i in range(1, 32): + print("Working on %02d" % i) sourceFile = pathBase + anonymizedPrefix + "%02d" % i + ".tsv.gz" if not (os.path.exists(sourceFile)): continue diff --git a/tools/joinMonth.py.bak b/tools/joinMonth.py.bak new file mode 100644 index 0000000..6d7b998 --- /dev/null +++ b/tools/joinMonth.py.bak @@ -0,0 +1,68 @@ +import argparse +import gzip +import os +import sys + +import config +from utility import utility + +def joinMonth(month, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None): + if os.path.isfile(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked") and not ignoreLock: + print "ERROR: The month " + month + " is being edited at the moment. Use -i or ignoreLock = True if you want to force the execution of this script." 
+ sys.exit() + + anonymizedFolder = "anonymousRawData/" + anonymizedPrefix = anonymizedFolder + "AnonymousQueryCnt" + + pathBase = utility.addMissingSlash(monthsFolder) \ + + utility.addMissingSlash(month) + + outputFile = month.strip("/").replace("/", "_") + "_Joined.tsv.gz" + + if outputFilename is not None: + outputFile = outputFilename + + targetFile = pathBase + anonymizedFolder + if outputPath is not None: + targetFile = outputPath + if not os.path.exists(targetFile): + os.makedirs(targetFile) + targetFile += outputFile + + with gzip.open(targetFile, "w") as target: + headerSet = False + for i in xrange(1, 32): + print "Working on %02d" % i + sourceFile = pathBase + anonymizedPrefix + "%02d" % i + ".tsv.gz" + if not (os.path.exists(sourceFile)): + continue + with gzip.open(sourceFile) as source: + if headerSet: + next(source) + else: + headerSet = True + for line in source: + target.write(line) + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="This script joins all the anonymized files into one while keeping only the first header.") + parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory " + + "are residing.") + parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") + parser.add_argument("--outputPath", "-p", type=str, help="The path where the " + + "output file should be generated.") + parser.add_argument("--outputFilename", "-o", type=str, help="The name of the output file to be generated.") + parser.add_argument("months", type=str, + help="the months which we're interested in") + + if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + + args = parser.parse_args() + + for monthName in args.months.split(","): + joinMonth(monthName, monthsFolder = args.monthsFolder, ignoreLock = args.ignoreLock, outputPath = args.outputPath, outputFilename = args.outputFilename) diff --git 
a/tools/oldScripts/getProperties.py b/tools/oldScripts/getProperties.py index 9b58c3b..b5cef1c 100644 --- a/tools/oldScripts/getProperties.py +++ b/tools/oldScripts/getProperties.py @@ -19,7 +19,7 @@ '}}', 'format': 'json' }).json() - print(len(allProperties)) + print((len(allProperties))) for category in categories['results']['bindings']: for property in allProperties: diff --git a/tools/oldScripts/getProperties.py.bak b/tools/oldScripts/getProperties.py.bak new file mode 100644 index 0000000..9b58c3b --- /dev/null +++ b/tools/oldScripts/getProperties.py.bak @@ -0,0 +1,35 @@ +import json +import os.path + +import requests +import urllib.request + +with urllib.request.urlopen("https://tools.wmflabs.org/hay/propbrowse/props." + + "json") as url: + allProperties = json.loads(url.read().decode()) + + categories = requests.get('https://query.wikidata.org/bigdata/namespace' + + '/wdq/sparql', params={ + 'query': '#TOOL: jgonsior \n' + 'SELECT ?item ' + '?itemLabel WHERE {' + '?item wdt:P279 wd:Q18616576 . ' + 'SERVICE wikibase:label ' + '{bd:serviceParam wikibase:language "en"' + '}}', + 'format': 'json' + }).json() + print(len(allProperties)) + + for category in categories['results']['bindings']: + for property in allProperties: + answer = requests.get( + 'https://query.wikidata.org/bigdata/namespace' + '/wdq/sparql', + params={ + 'query': '#TOOL: jgonsior \n' + 'SELECT ?x ' + '{?x wdt:P31 wd:' + + os.path.basename(category['item']['value']) + '}', + 'format': 'json'}).json() + # pprint(answer) diff --git a/tools/operatorUsageStatistic.py b/tools/operatorUsageStatistic.py index c8d4614..c80fd34 100644 --- a/tools/operatorUsageStatistic.py +++ b/tools/operatorUsageStatistic.py @@ -4,9 +4,9 @@ from collections import defaultdict from pprint import pprint -import config -from postprocess import processdata -from utility import utility +from . 
import config +from .postprocess import processdata +from .utility import utility import itertools parser = argparse.ArgumentParser( @@ -42,9 +42,9 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print("ERROR: The month " + str(args.month) + + print(("ERROR: The month " + str(args.month) + " is being edited at the moment." + - " Use -i if you want to force the execution of this script.") + " Use -i if you want to force the execution of this script.")) sys.exit() @@ -132,13 +132,13 @@ def handle(self, sparqlQuery, processed): def printSparqlTranslation(self): result = "" i = 1 - for featureName, featureCount in sorted(self.statistic.iteritems()): + for featureName, featureCount in sorted(self.statistic.items()): #print(featureName + "\t" + str(featureCount)) print(featureCount) i += 1 print("") - print(str(self.totalCount)) + print((str(self.totalCount))) handler = OperatorStatisticHandler() @@ -146,6 +146,6 @@ def printSparqlTranslation(self): processdata.processMonth( handler, args.month, args.monthsFolder, notifications=False) -print args.position +print(args.position) handler.printSparqlTranslation() diff --git a/tools/operatorUsageStatistic.py.bak b/tools/operatorUsageStatistic.py.bak new file mode 100644 index 0000000..c8d4614 --- /dev/null +++ b/tools/operatorUsageStatistic.py.bak @@ -0,0 +1,151 @@ +import argparse +import os +import sys +from collections import defaultdict +from pprint import pprint + +import config +from postprocess import processdata +from utility import utility +import itertools + +parser = argparse.ArgumentParser( + description= + "Prints out the Operator statistic in a nicely formatted way which can be pasted directly into a Google Spreadsheet" +) +parser.add_argument( + "--monthsFolder", + "-m", + default=config.monthsFolder, + type=str, + help="the folder in which the months directory " + "are residing") +parser.add_argument( + 
"--ignoreLock", + "-i", + help="Ignore locked file and execute" + " anyways", + action="store_true") +parser.add_argument( + "--position", + "-p", + default="default position", + type=str, + help="The position to be displayed before the data.") +parser.add_argument( + "month", type=str, help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print("ERROR: The month " + str(args.month) + + " is being edited at the moment." + + " Use -i if you want to force the execution of this script.") + sys.exit() + + +class OperatorStatisticHandler: + statistic = defaultdict(int) + totalCount = 0 + operators = [ + "Filter", "Join", "Union", "Optional", "Values", "Path", "Subquery" + ] + + def __init__(self): + allOperatorsCombinations = set() + + # generate all possible combinations + for i in [1, 2, 3, 4, 5, 6, 7]: + for operator in itertools.combinations(self.operators, i): + allOperatorsCombinations.add(operator) + for operators in allOperatorsCombinations: + self.statistic[", ".join(sorted(operators))] = 0 + + def handle(self, sparqlQuery, processed): + if (processed['#Valid'] == 'VALID'): + self.totalCount += 1 + + usedSparqlFeatures = set() + + langService = False + other = False + service = False + nonService = False + + for usedSparqlFeature in processed['#UsedSparqlFeatures'].split( + ","): + usedSparqlFeature = usedSparqlFeature.lstrip() + if usedSparqlFeature == "UnionGraphPattern": + usedSparqlFeature = "Union" + elif usedSparqlFeature == "OptionalGraphPattern": + usedSparqlFeature = "Optional" + elif usedSparqlFeature == "BindingValue": + usedSparqlFeature = "Values" + elif usedSparqlFeature == "+" or usedSparqlFeature == "*": + usedSparqlFeature = "Path" + elif usedSparqlFeature == "LangService": + usedSparqlFeature = "" + elif 
usedSparqlFeature == "Filter" or usedSparqlFeature == "ExistsFunc" or usedSparqlFeature == "NotExistsFunc": + usedSparqlFeature = "Filter" + elif usedSparqlFeature == "Join": + usedSparqlFeature = "Join" + elif usedSparqlFeature == "SubSelect": + usedSparqlFeature = "Subquery" + elif usedSparqlFeature == "SelectQuery" or usedSparqlFeature == "ConstructQuery" or usedSparqlFeature == "AskQuery" or usedSparqlFeature == "DescribeQuery" or usedSparqlFeature == "": + usedSparqlFeature = "" + elif usedSparqlFeature == "LangService": + langService = True + usedSparqlFeature = "" + elif usedSparqlFeature == "Service": + service = True + usedSparqlFeature = "" + other = True + elif usedSparqlFeature == "MinusGraphPattern" or usedSparqlFeature == "Bind": + other = True + nonService = True + usedSparqlFeature = "" + + usedSparqlFeatures.add(usedSparqlFeature) + + # this is the case in which there was only a langService -> this + # one is allowed to be NOT in OTHER + if service and langService and not nonService: + other = False + + # check which operators are present: + presentOperators = set() + for operator in self.operators: + if operator in usedSparqlFeatures: + presentOperators.add(operator) + + if len(presentOperators) == 0: + self.statistic["None"] += 1 + elif other: + self.statistic["Other"] += 1 + else: + self.statistic[", ".join(sorted(presentOperators))] += 1 + + def printSparqlTranslation(self): + result = "" + i = 1 + for featureName, featureCount in sorted(self.statistic.iteritems()): + #print(featureName + "\t" + str(featureCount)) + print(featureCount) + i += 1 + + print("") + print(str(self.totalCount)) + + +handler = OperatorStatisticHandler() + +processdata.processMonth( + handler, args.month, args.monthsFolder, notifications=False) + +print args.position + +handler.printSparqlTranslation() diff --git a/tools/plotHourlyMetricCount.py b/tools/plotHourlyMetricCount.py index 079a988..98e7c57 100644 --- a/tools/plotHourlyMetricCount.py +++ 
b/tools/plotHourlyMetricCount.py @@ -86,7 +86,7 @@ def plotHist(data, title, countTools, xlabel="hour", ylabel="count of queries", inputDirectory = "dayTriple/" files = [] -for i in xrange(1, 3): +for i in range(1, 3): files.append(metricName + "/" + "%02d" % i + "ClassifiedBotsData.tsv") @@ -105,10 +105,10 @@ def compare(item1, item2): totalMetricNames = set() for file in files: - print "Working on: " + file + print("Working on: " + file) day = file.replace(metricName + '/', '').replace("ClassifiedBotsData.tsv", '') - print os.getcwd() + print(os.getcwd()) with open(file) as f: reader = csv.DictReader(f, delimiter="\t") @@ -140,7 +140,7 @@ def compare(item1, item2): data[metric]["Y"].append(count) # sort data so that the log graph is kind of useful - sorted_data = sorted(data.items(), cmp=compare) + sorted_data = sorted(list(data.items()), cmp=compare) plotHist(sorted_data, metricName + '/plots/day' + day, len(set(metrics))) plotHist(sorted_data, metricName + '/plots/log/day' + day, len(set(metrics)), log=True) @@ -154,7 +154,7 @@ def compare(item1, item2): sum += count totalDataPerDay[metric]["Y"].append(sum) -sorted_data = sorted(totalDataPerDay.items(), cmp=compare) +sorted_data = sorted(list(totalDataPerDay.items()), cmp=compare) plotHist(sorted_data, metricName + "/plots/total", len(totalMetricNames), xlabel='day') plotHist(sorted_data, metricName + "/plots/total_log", len(totalMetricNames), xlabel='day', diff --git a/tools/plotHourlyMetricCount.py.bak b/tools/plotHourlyMetricCount.py.bak new file mode 100644 index 0000000..079a988 --- /dev/null +++ b/tools/plotHourlyMetricCount.py.bak @@ -0,0 +1,161 @@ +import argparse +import csv +import os + +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import numpy as np +import sys +from matplotlib.pyplot import cm + +parser = argparse.ArgumentParser( + description="Generates hourly/daily/monthly plots per metric, please run getHourlyMetricCount.py before") +parser.add_argument("--monthsFolder", 
"-m", type=str, default="/a/akrausetud/months", + help="the folder in which the months directory are residing") +parser.add_argument("metric", type=str, help="the metric which we want to count (without #)") +parser.add_argument("month", default="/a/akrausetud/month", type=str, help="the month which we're interested in") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +workingDir = args.monthsFolder + "/" + args.month + "/processedLogData/hourlyMetricCountData" +os.chdir(workingDir) + +metricName = args.metric + + +def plotHist(data, title, countTools, xlabel="hour", ylabel="count of queries", log=False): + if not os.path.exists(title[:title.rfind("/")]): + os.makedirs(title[:title.rfind("/")]) + fig = plt.figure(1) + ax = fig.add_subplot(111) + plt.grid(True) + + plt.title(title) + plt.xlabel(xlabel) + plt.ylabel(ylabel) + + axes = plt.axes() + axes.xaxis.set_major_locator(ticker.MultipleLocator(1)) + + if log: + axes.set_yscale('log') + + colormap = cm.nipy_spectral(np.linspace(0, 1, countTools)) + color = iter(colormap) + + minMetricValue = data[0][1]["Y"][0] + maxMetricValue = 0 + + for dataPoint in data: + c = next((color)) + XY = dataPoint[1] + + if (XY["Y"][0] < minMetricValue): + minMetricValue = XY["Y"][0] + + if (XY["X"][0] > maxMetricValue): + maxMetricValue = XY["Y"][0] + + try: + ax.bar(XY["X"], XY["Y"], align='center', color=c, edgecolor=c) + except ValueError: + pass + + scalarMappaple = cm.ScalarMappable(cmap=cm.nipy_spectral, + norm=plt.Normalize(vmin=minMetricValue, vmax=maxMetricValue)) + scalarMappaple._A = [] + plt.colorbar(scalarMappaple) + + if xlabel is 'hour': + plt.xlim(-1, 24) + + if xlabel is 'day': + plt.xlim(0, 32) + + plt.xticks(fontsize=9) + + plt.savefig(title + ".png", bbox_inches='tight') + + plt.close() + + +inputDirectory = "dayTriple/" + +files = [] +for i in xrange(1, 3): + files.append(metricName + "/" + "%02d" % i + "ClassifiedBotsData.tsv") + + +# 
files.append("classifiedBotsData/TotalClassifiedBotsData.tsv") + +def compare(item1, item2): + if max(item1[1]["Y"]) < max(item2[1]["Y"]): + return 1 + elif max(item1[1]["Y"]) == max(item2[1]["Y"]): + return 0 + else: + return -1 + + +totalDataPerDay = {} +totalMetricNames = set() + +for file in files: + print "Working on: " + file + + day = file.replace(metricName + '/', '').replace("ClassifiedBotsData.tsv", '') + print os.getcwd() + with open(file) as f: + reader = csv.DictReader(f, delimiter="\t") + + hours = [] + metrics = [] + counts = [] + + for line in reader: + hours.append(int(line['hour'])) + metrics.append(line[metricName]) + totalMetricNames.add(line[metricName]) + counts.append(int(line['count'])) + + # divide data into "stacks" + + data = {} + for metric in metrics: + if metric not in totalDataPerDay: + totalDataPerDay[metric] = {} + totalDataPerDay[metric]["X"] = list() + totalDataPerDay[metric]["Y"] = list() + + data[metric] = {} + data[metric]["X"] = list() + data[metric]["Y"] = list() + + for hour, metric, count in zip(hours, metrics, counts): + data[metric]["X"].append(hour) + data[metric]["Y"].append(count) + + # sort data so that the log graph is kind of useful + sorted_data = sorted(data.items(), cmp=compare) + + plotHist(sorted_data, metricName + '/plots/day' + day, len(set(metrics))) + plotHist(sorted_data, metricName + '/plots/log/day' + day, len(set(metrics)), log=True) + + for metric in totalMetricNames: + if metric in data: + totalDataPerDay[metric]["X"].append(int(day)) + + sum = 0 + for count in data[metric]["Y"]: + sum += count + totalDataPerDay[metric]["Y"].append(sum) + +sorted_data = sorted(totalDataPerDay.items(), cmp=compare) + +plotHist(sorted_data, metricName + "/plots/total", len(totalMetricNames), xlabel='day') +plotHist(sorted_data, metricName + "/plots/total_log", len(totalMetricNames), xlabel='day', + log=True) diff --git a/tools/postprocess/processdata.py b/tools/postprocess/processdata.py index e96aac2..69057ab 100644 
--- a/tools/postprocess/processdata.py +++ b/tools/postprocess/processdata.py @@ -3,11 +3,11 @@ import glob import gzip import os -import urllib -import urlparse +import urllib.request, urllib.parse, urllib.error +import urllib.parse from pprint import pprint import sys -from itertools import izip + from utility import utility @@ -47,14 +47,14 @@ def processMonth(handler, month, monthsFolder, anonymous = False, notifications def processDay(handler, day, month, monthsFolder, - startIdx=0, endIdx=sys.maxint, notifications = True): + startIdx=0, endIdx=sys.maxsize, notifications = True): processedFileName = utility.addMissingSlash(monthsFolder) \ + utility.addMissingSlash(month) \ + processedFolder + processedPrefix + "%02d" % day \ + processedSuffix if notifications: - print "Working on: " + processedFileName + print("Working on: " + processedFileName) with gzip.open(processedFileName) as p, \ gzip.open(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "rawLogData/" @@ -63,12 +63,12 @@ def processDay(handler, day, month, monthsFolder, sReader = csv.DictReader(s, delimiter="\t") i = 0 - for processed, source in izip(pReader, sReader): + for processed, source in zip(pReader, sReader): if startIdx <= i <= endIdx: - requestParameters = dict(urlparse.parse_qsl(urlparse.urlsplit( + requestParameters = dict(urllib.parse.parse_qsl(urllib.parse.urlsplit( source['uri_query']).query.replace(';', "%3B"))) - if 'query' in requestParameters.keys(): + if 'query' in list(requestParameters.keys()): sparqlQuery = requestParameters['query'] else: sparqlQuery = None @@ -84,20 +84,20 @@ def processDay(handler, day, month, monthsFolder, break i += 1 -def processDayAnonymous(handler, day, month, monthsFolder, startIdx=0, endIdx=sys.maxint, notifications = True): +def processDayAnonymous(handler, day, month, monthsFolder, startIdx=0, endIdx=sys.maxsize, notifications = True): anonymousFileName = utility.addMissingSlash(monthsFolder) \ + utility.addMissingSlash(month) 
\ + anonymousDataFolder + anonymousFilePrefix + "%02d" % day + anonymousFileSuffix if notifications: - print "Working on: " + anonymousFileName + print("Working on: " + anonymousFileName) with gzip.open(anonymousFileName) as a: aReader = csv.DictReader(a, delimiter="\t") i = 0 for anonymous in aReader: if startIdx <= i <= endIdx: - sparqlQuery = urllib.unquote_plus(anonymous['#anonymizedQuery']) + sparqlQuery = urllib.parse.unquote_plus(anonymous['#anonymizedQuery']) anonymous['Valid'] = 'VALID' handler.handle(sparqlQuery, anonymous) @@ -105,11 +105,11 @@ def processDayAnonymous(handler, day, month, monthsFolder, startIdx=0, endIdx=sy break i += 1 -def processRankedQueryType(handler, month, monthsFolder, startIdx = 0, endIdx = sys.maxint, notifications = True): +def processRankedQueryType(handler, month, monthsFolder, startIdx = 0, endIdx = sys.maxsize, notifications = True): rankedQueryTypeFilename = utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + rankedQueryTypeFolder + rankedQueryTypeFile if notifications: - print "Working on: " + rankedQueryTypeFilename + print("Working on: " + rankedQueryTypeFilename) with open(rankedQueryTypeFilename) as r: rReader = csv.DictReader(r, delimiter = "\t") diff --git a/tools/postprocess/processdata.py.bak b/tools/postprocess/processdata.py.bak new file mode 100644 index 0000000..e96aac2 --- /dev/null +++ b/tools/postprocess/processdata.py.bak @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +import csv +import glob +import gzip +import os +import urllib +import urlparse +from pprint import pprint +import sys +from itertools import izip + +from utility import utility + +processedFolder = "processedLogData/" +processedPrefix = "QueryProcessedOpenRDF" +processedSuffix = ".tsv.gz" +sourcePrefix = "QueryCnt" + +anonymousDataFolder = "anonymousRawData/" +anonymousFilePrefix = "AnonymousQueryCnt" +anonymousFileSuffix = ".tsv.gz" + +rankedQueryTypeFolder = "queryTypeDataRanking/" +rankedQueryTypeFile = 
"Query_Type_Data_Ranking.tsv" + + +def processMonth(handler, month, monthsFolder, anonymous = False, notifications = True): + folderToSearch = processedFolder + prefixToSearch = processedPrefix + suffixToSearch = processedSuffix + + if anonymous: + folderToSearch = anonymousDataFolder + prefixToSearch = anonymousFilePrefix + suffixToSearch = anonymousFileSuffix + + for filename in glob.glob(utility.addMissingSlash(monthsFolder) + + utility.addMissingSlash(month) + + folderToSearch + prefixToSearch + "*" + + suffixToSearch): + day = os.path.basename(filename)[len( + prefixToSearch):][:-len(suffixToSearch)] + if anonymous: + processDayAnonymous(handler, int(day), month, monthsFolder, notifications = notifications) + else: + processDay(handler, int(day), month, monthsFolder, notifications = notifications) + + +def processDay(handler, day, month, monthsFolder, + startIdx=0, endIdx=sys.maxint, notifications = True): + processedFileName = utility.addMissingSlash(monthsFolder) \ + + utility.addMissingSlash(month) \ + + processedFolder + processedPrefix + "%02d" % day \ + + processedSuffix + + if notifications: + print "Working on: " + processedFileName + with gzip.open(processedFileName) as p, \ + gzip.open(utility.addMissingSlash(monthsFolder) + + utility.addMissingSlash(month) + "rawLogData/" + + sourcePrefix + "%02d" % day + ".tsv.gz") as s: + pReader = csv.DictReader(p, delimiter="\t") + sReader = csv.DictReader(s, delimiter="\t") + + i = 0 + for processed, source in izip(pReader, sReader): + if startIdx <= i <= endIdx: + requestParameters = dict(urlparse.parse_qsl(urlparse.urlsplit( + source['uri_query']).query.replace(';', "%3B"))) + + if 'query' in requestParameters.keys(): + sparqlQuery = requestParameters['query'] + else: + sparqlQuery = None + + processed['hour'] = source['hour'] + processed['day'] = day + processed['user_agent'] = source['user_agent'] + processed['http_status'] = source['http_status'] + processed['timestamp'] = source['ts'] + processed['ts'] = 
source['ts'] + handler.handle(sparqlQuery, processed) + elif i > endIdx: + break + i += 1 + +def processDayAnonymous(handler, day, month, monthsFolder, startIdx=0, endIdx=sys.maxint, notifications = True): + anonymousFileName = utility.addMissingSlash(monthsFolder) \ + + utility.addMissingSlash(month) \ + + anonymousDataFolder + anonymousFilePrefix + "%02d" % day + anonymousFileSuffix + + if notifications: + print "Working on: " + anonymousFileName + with gzip.open(anonymousFileName) as a: + aReader = csv.DictReader(a, delimiter="\t") + + i = 0 + for anonymous in aReader: + if startIdx <= i <= endIdx: + sparqlQuery = urllib.unquote_plus(anonymous['#anonymizedQuery']) + + anonymous['Valid'] = 'VALID' + handler.handle(sparqlQuery, anonymous) + elif i > endIdx: + break + i += 1 + +def processRankedQueryType(handler, month, monthsFolder, startIdx = 0, endIdx = sys.maxint, notifications = True): + rankedQueryTypeFilename = utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + rankedQueryTypeFolder + rankedQueryTypeFile + + if notifications: + print "Working on: " + rankedQueryTypeFilename + + with open(rankedQueryTypeFilename) as r: + rReader = csv.DictReader(r, delimiter = "\t") + + i = 0 + for ranked in rReader: + if startIdx <= i <= endIdx: + handler.handle(ranked["ExampleQuery"], ranked) + elif i > endIdx: + break + i += 1 diff --git a/tools/rankDataTypes.py b/tools/rankDataTypes.py index 841135a..a3c096e 100644 --- a/tools/rankDataTypes.py +++ b/tools/rankDataTypes.py @@ -5,10 +5,10 @@ from collections import defaultdict -import config +from . 
import config -from postprocess import processdata -from utility import utility +from .postprocess import processdata +from .utility import utility parser = argparse.ArgumentParser( description="Tool to rank the used data types") @@ -30,7 +30,7 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the " + print("ERROR: The month " + args.month + " is being edited at the ") + "moment. Use -i if you want to force the execution of this script." sys.exit() @@ -49,6 +49,6 @@ def handle(self, sparqlQuery, processed): else: processdata.processMonth(handler, args.month, args.monthsFolder) -print "count\tdataType" -for k, v in sorted(ranking.iteritems(), key=lambda (k, v): (v, k), reverse=True): - print str(v) + "\t" + k +print("count\tdataType") +for k, v in sorted(iter(ranking.items()), key=lambda k_v: (k_v[1], k_v[0]), reverse=True): + print(str(v) + "\t" + k) diff --git a/tools/rankDataTypes.py.bak b/tools/rankDataTypes.py.bak new file mode 100644 index 0000000..841135a --- /dev/null +++ b/tools/rankDataTypes.py.bak @@ -0,0 +1,54 @@ +import argparse +import os +import re +import sys + +from collections import defaultdict + +import config + +from postprocess import processdata +from utility import utility + +parser = argparse.ArgumentParser( + description="Tool to rank the used data types") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory" + + " are residing.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and " + + "execute anyways", action="store_true") +parser.add_argument("month", type=str, help="The month from which lines " + + "should be displayed.") +parser.add_argument("--anonymous", "-a", action="store_true", help="Check to switch to ranking the anonymous data.") + +if (len(sys.argv[1:]) == 0): + 
parser.print_help() + parser.exit() + +args = parser.parse_args() + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the " + + "moment. Use -i if you want to force the execution of this script." + sys.exit() + +ranking = defaultdict(int) + +class rankDataTypesHandler: + + def handle(self, sparqlQuery, processed): + for entry in re.findall(r'\^\^(.*?)( |\)|\\n)', str(sparqlQuery)): + ranking[entry[0]] += 1 + +handler = rankDataTypesHandler() + +if args.anonymous: + processdata.processMonth(handler, args.month, args.monthsFolder, anonymous = True) +else: + processdata.processMonth(handler, args.month, args.monthsFolder) + +print "count\tdataType" +for k, v in sorted(ranking.iteritems(), key=lambda (k, v): (v, k), reverse=True): + print str(v) + "\t" + k diff --git a/tools/showData.py b/tools/showData.py index 5e71c4c..099dfcd 100644 --- a/tools/showData.py +++ b/tools/showData.py @@ -4,9 +4,9 @@ import os import sys from tabulate import tabulate -from postprocess import processdata -from utility import utility -import config +from .postprocess import processdata +from .utility import utility +from . 
import config from pprint import pprint parser = argparse.ArgumentParser( @@ -20,7 +20,7 @@ + "only valid lines are being looked at") parser.add_argument("--startline", "-s", default=0, type=int, help="The line" + " from which we want to start displaying the data.") -parser.add_argument("--endline", "-e", default=sys.maxint, type=int, +parser.add_argument("--endline", "-e", default=sys.maxsize, type=int, help="The line where we want to stop displaying the data.") parser.add_argument("--line", "-l", type=int, help="Set if you only want to display one specific line.") parser.add_argument("--day", "-d", default = 1, type=int, help="The day of the month from which " @@ -54,7 +54,7 @@ if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the " + print("ERROR: The month " + args.month + " is being edited at the ") + "moment. Use -i if you want to force the execution of this script." sys.exit() @@ -86,13 +86,13 @@ def handle(self, sparqlQuery, processed): for metric in metrics: data[0].append(processed[metric]) - print tabulate(data, headers=metrics) - print "Query:" + print(tabulate(data, headers=metrics)) + print("Query:") if sparqlQuery is None: - print "Error: Could not find query in uri_query." 
+ print("Error: Could not find query in uri_query.") else: - print sparqlQuery - print "" + print(sparqlQuery) + print("") handler = ViewDataHandler() diff --git a/tools/showData.py.bak b/tools/showData.py.bak new file mode 100644 index 0000000..5e71c4c --- /dev/null +++ b/tools/showData.py.bak @@ -0,0 +1,105 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +import argparse +import os +import sys +from tabulate import tabulate +from postprocess import processdata +from utility import utility +import config +from pprint import pprint + +parser = argparse.ArgumentParser( + description="Tool to view the content of the processed query logs") +parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory" + + " are residing.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and " + + "execute anyways", action="store_true") +parser.add_argument('--onlyValid', "-o", action='store_true', help="If set " + + "only valid lines are being looked at") +parser.add_argument("--startline", "-s", default=0, type=int, help="The line" + + " from which we want to start displaying the data.") +parser.add_argument("--endline", "-e", default=sys.maxint, type=int, + help="The line where we want to stop displaying the data.") +parser.add_argument("--line", "-l", type=int, help="Set if you only want to display one specific line.") +parser.add_argument("--day", "-d", default = 1, type=int, help="The day of the month from which " + + "lines should be displayed. 
Default is 1.") +parser.add_argument("month", type=str, help="The month from which lines " + + "should be displayed.") +parser.add_argument("--metricsToBeViewed", "-v", default = "", type=str, help="The metrics that should be " + + "show, separated by comma (e.g QuerySize,QueryType)") +parser.add_argument("--metricsNotNull", "-n", default="", type=str, + help="The list of metrics that shouldn't be null, " + + "separated by comma") +parser.add_argument("--anonymous", "-a", action="store_true", help="Check to switch to viewing the anonymous data." + + " WARNING: No processed metrics are available for anonymous data because the anonymous files" + + " do not synch up to the processed files due to dropping the invalid lines.") +parser.add_argument("--queryTypeRanking", "-q", action="store_true", help="Display lines from the ranked query type file." + + " NOTE: The day setting is ignored if query type ranking is enabled.") + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +startLine = args.startline +endLine = args.endline + +if args.line != None: + startLine = args.line + endLine = args.line + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + + utility.addMissingSlash(args.month) + "locked") \ + and not args.ignoreLock: + print "ERROR: The month " + args.month + " is being edited at the " + + "moment. Use -i if you want to force the execution of this script." 
+ sys.exit() + +metrics = list() +metricsNotNull = list() + +if args.metricsToBeViewed is not "": + for metric in args.metricsToBeViewed.split(","): + metrics.append(utility.addMissingDoubleCross(metric)) + +if args.metricsNotNull is not "": + for metric in args.metricsNotNull.split(","): + metricsNotNull.append(utility.addMissingDoubleCross(metric)) + + +class ViewDataHandler: + + def handle(self, sparqlQuery, processed): + data = [[]] + if args.onlyValid: + if processed['#Valid'] is not 'VALID': + return + + for metricNotNull in metricsNotNull: + if processed[metricNotNull] is '' \ + or processed[metricNotNull] is "NONE" \ + or processed[metricNotNull] is 0: + return + + for metric in metrics: + data[0].append(processed[metric]) + print tabulate(data, headers=metrics) + print "Query:" + if sparqlQuery is None: + print "Error: Could not find query in uri_query." + else: + print sparqlQuery + print "" + + +handler = ViewDataHandler() + +if args.anonymous: + processdata.processDayAnonymous(handler, args.day, args.month, args.monthsFolder, startIdx=startLine, endIdx=endLine) +elif args.queryTypeRanking: + processdata.processRankedQueryType(handler, args.month, args.monthsFolder, startIdx = startLine, endIdx = endLine) +else: + processdata.processDay(handler, args.day, args.month, args.monthsFolder, startIdx=startLine, endIdx=endLine) diff --git a/tools/sortByTime.py b/tools/sortByTime.py index 6fb7dbd..85163f5 100644 --- a/tools/sortByTime.py +++ b/tools/sortByTime.py @@ -7,9 +7,9 @@ import pandas -import config +from . import config -from utility import utility +from .utility import utility parser = argparse.ArgumentParser(description="Sorts the raw log files by timestamp.") parser.add_argument("--monthsFolder", "-m", @@ -37,18 +37,18 @@ key = "ts" if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: - print "ERROR: The month " + args.month + " is being edited at the moment. 
Use -i if you want to force the execution of this script." + print("ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script.") sys.exit() os.chdir(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month)) -for i in xrange(1, 32): +for i in range(1, 32): filename = sourcePrefix + "%02d" % i filename_gzip = filename + ".tsv.gz" filename_tsv = filename + ".tsv" if not os.path.exists(filename_gzip): continue - print "Working on %02d" % i + print("Working on %02d" % i) with gzip.open(filename_gzip, 'rb') as input_file, open(filename_tsv, 'wb') as output_file: shutil.copyfileobj(input_file, output_file) os.remove(filename_gzip) diff --git a/tools/sortByTime.py.bak b/tools/sortByTime.py.bak new file mode 100644 index 0000000..6fb7dbd --- /dev/null +++ b/tools/sortByTime.py.bak @@ -0,0 +1,60 @@ +import argparse +import csv +import gzip +import os +import shutil +import sys + +import pandas + +import config + +from utility import utility + +parser = argparse.ArgumentParser(description="Sorts the raw log files by timestamp.") +parser.add_argument("--monthsFolder", "-m", + default=config.monthsFolder, type=str, + help="The folder in which the months directories are" + + " residing.") +parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and execute" + + " anyways", action="store_true") +parser.add_argument("--anonymous", "-a", action="store_true", help="Check to switch to sort the anonymous data.") +parser.add_argument("month", type=str, + help="The month whose raw log files should be sorted.") + + +if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + +args = parser.parse_args() + +if args.anonymous: + sourcePrefix = config.anonymousPrefix + key = "timestamp" +else: + sourcePrefix = config.sourcePrefix + key = "ts" + +if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: + 
print "ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script." + sys.exit() + +os.chdir(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month)) + +for i in xrange(1, 32): + filename = sourcePrefix + "%02d" % i + filename_gzip = filename + ".tsv.gz" + filename_tsv = filename + ".tsv" + if not os.path.exists(filename_gzip): + continue + print "Working on %02d" % i + with gzip.open(filename_gzip, 'rb') as input_file, open(filename_tsv, 'wb') as output_file: + shutil.copyfileobj(input_file, output_file) + os.remove(filename_gzip) + df = pandas.read_csv(filename_tsv, sep="\t", header=0, index_col=0) + df = df.sort_values(by=[key]) + df.to_csv(filename_tsv, sep="\t") + with open (filename_tsv, 'rb') as input_file, gzip.open(filename_gzip, 'wb') as output_file: + shutil.copyfileobj(input_file, output_file) + os.remove(filename_tsv) diff --git a/tools/utility/utility.py b/tools/utility/utility.py index ac4b315..34d865d 100644 --- a/tools/utility/utility.py +++ b/tools/utility/utility.py @@ -28,15 +28,15 @@ def fetchEntries(processed, metric, nosplitting = False): try: hour = int(processed["hour"]) except ValueError: - print processed["hour"] + " could not be parsed as integer" + print(processed["hour"] + " could not be parsed as integer") return [] - if hour not in xrange(0,24): - print str(hour) + " is not in 0-23" + if hour not in range(0,24): + print(str(hour) + " is not in 0-23") return [] try: day = int(processed["day"]) except: - print processed["day"] + " could not be parsed as integer" + print(processed["day"] + " could not be parsed as integer") return [] return [hour + 24 * (day - 1)] @@ -54,7 +54,7 @@ def fetchEntries(processed, metric, nosplitting = False): def splitEntry(entry): field_array = entry.split(",") - field_array = map(lambda it: it.strip(), field_array) + field_array = [it.strip() for it in field_array] field_array = [x for x in field_array if x] return 
field_array @@ -75,7 +75,7 @@ def setup(self, filterParameter): self.parameters[arguments[0]] = re.compile(arguments[1]) def checkLine(self, processed): - for key, value in self.parameters.iteritems(): + for key, value in self.parameters.items(): match = re.match(value, str(processed[key])) if match == None: return False diff --git a/tools/utility/utility.py.bak b/tools/utility/utility.py.bak new file mode 100644 index 0000000..ac4b315 --- /dev/null +++ b/tools/utility/utility.py.bak @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- + +import re + +# This list contains all fields that should not be split because they could contain commas +notToSplit = ["user_agent", "ToolName"] + +def listToString(list): + returnString = "" + for entry in list: + returnString += entry + "," + return returnString[:-1] + +def addMissingSlash(directoryString): + if not directoryString.endswith("/"): + return directoryString + "/" + return directoryString + +def argMetric(metric): + if metric.startswith("#"): + return metric[1:] + else: + return metric + +def fetchEntries(processed, metric, nosplitting = False): + metric = argMetric(metric) + if metric == "monthly_hour": + try: + hour = int(processed["hour"]) + except ValueError: + print processed["hour"] + " could not be parsed as integer" + return [] + if hour not in xrange(0,24): + print str(hour) + " is not in 0-23" + return [] + try: + day = int(processed["day"]) + except: + print processed["day"] + " could not be parsed as integer" + return [] + + return [hour + 24 * (day - 1)] + else: + data = processed[metric] + if metric in notToSplit: + return [data] + else: + field_array = splitEntry(data) + if nosplitting: + field_array = sorted(field_array) + return [listToString(field_array)] + else: + return field_array + +def splitEntry(entry): + field_array = entry.split(",") + field_array = map(lambda it: it.strip(), field_array) + field_array = [x for x in field_array if x] + return field_array + +class filter: + + parameters = dict() + + 
def setup(self, filterParameter): + self.parameters["Valid"] = re.compile("^VALID$") + + filters = filterParameter.split(",") + + if filters == ['']: + return + + for element in filters: + arguments = element.split("=") + self.parameters[arguments[0]] = re.compile(arguments[1]) + + def checkLine(self, processed): + for key, value in self.parameters.iteritems(): + match = re.match(value, str(processed[key])) + if match == None: + return False + return True diff --git a/tools/visualisePropertyTree/createHtml.py b/tools/visualisePropertyTree/createHtml.py index e1ed8cd..6149fac 100644 --- a/tools/visualisePropertyTree/createHtml.py +++ b/tools/visualisePropertyTree/createHtml.py @@ -15,7 +15,7 @@ rankings = dict() -print("Working on: " + args.rankingFile) +print(("Working on: " + args.rankingFile)) with open(args.rankingFile, "r") as rankingFile: rankingReader = csv.DictReader(rankingFile, delimiter="\t") diff --git a/tools/visualisePropertyTree/createHtml.py.bak b/tools/visualisePropertyTree/createHtml.py.bak new file mode 100644 index 0000000..e1ed8cd --- /dev/null +++ b/tools/visualisePropertyTree/createHtml.py.bak @@ -0,0 +1,165 @@ +import argparse +import csv +from functools import cmp_to_key + +from SPARQLWrapper import SPARQLWrapper, JSON + +parser = argparse.ArgumentParser( + description="Generates based on the content of the ranking.tsv file a " + + "JavaScript Json Object which contains detailed usage information " + + "about each property") +parser.add_argument("--rankingFile", "-r", default="ranking.tsv", type=str, + help="the file which contains the ranking information") + +args = parser.parse_args() + +rankings = dict() + +print("Working on: " + args.rankingFile) +with open(args.rankingFile, "r") as rankingFile: + rankingReader = csv.DictReader(rankingFile, delimiter="\t") + + for ranking in rankingReader: + rankings["http://www.wikidata.org/entity/" + ranking["Categories"]] = ( + int(ranking["Categories_count"]), float(ranking["percentage"])) + +# 
pprint(rankings) + +sparql = SPARQLWrapper("https://query.wikidata.org/sparql") + +sparql.setQuery(""" + #Tool: jgonsior-tree + SELECT ?property ?propertyLabel ?subclass0 ?subclass0Label + ?subclass1 ?subclass1Label ?subclass2 ?subclass2Label + ?subclass3 ?subclass3Label ?subclass4 ?subclass4Label + ?subclass5 ?subclass5Label ?subclass6 ?subclass6Label + ?subclass7 ?subclass7Label ?subclass8 ?subclass8Label + ?subclass9 ?subclass9Label + WHERE {{ BIND (wd:Q18616576 as ?property) + ?subclass0 wdt:P279 ?property . + OPTIONAL {?subclass1 wdt:P279 ?subclass0 . + OPTIONAL {?subclass2 wdt:P279 ?subclass1 . + OPTIONAL {?subclass3 wdt:P279 ?subclass2 . + OPTIONAL {?subclass4 wdt:P279 ?subclass3 . + OPTIONAL {?subclass5 wdt:P279 ?subclass4 . + OPTIONAL {?subclass6 wdt:P279 ?subclass5 . + OPTIONAL {?subclass7 wdt:P279 ?subclass6 . + OPTIONAL {?subclass8 wdt:P279 ?subclass7 . + OPTIONAL {?subclass9 wdt:P279 ?subclass8 .}}}}}}}}}} + SERVICE wikibase:label + { bd:serviceParam wikibase:language "en" }} + """) + +sparql.setReturnFormat(JSON) +sparqlResult = sparql.query().convert() + + +def createProperty(name, qid): + property = dict() + property['name'] = name + property['qid'] = qid + if qid in rankings: + property['countUserQueries'] = rankings[qid][0] + else: + property['countUserQueries'] = 0 + + property['children'] = list() + return property + + +def searchPropertyInTree(treeRoot, qid): + for possibleProperty in treeRoot['children']: + if possibleProperty['qid'] == qid: + return possibleProperty + return None + + +rootProperty = createProperty("/", "/") + +for property in sparqlResult['results']['bindings']: + parentProperty = searchPropertyInTree(rootProperty, + property['property']['value']) + if parentProperty is None: + parentProperty = createProperty(property['propertyLabel']['value'], + property['property']['value']) + rootProperty['children'].append(parentProperty) + + for i in range(0, 9): + if 'subclass' + str(i) in property: + childProperty = 
searchPropertyInTree( + parentProperty, + property['subclass' + str(i)]['value']) + if childProperty is None: + childProperty = createProperty( + property['subclass' + str(i) + "Label"]['value'], + property['subclass' + str(i)]['value']) + parentProperty['children'].append(childProperty) + parentProperty = childProperty + else: + break + + +def compareProperties(property1, property2): + if property1['countUserQueries'] < property2['countUserQueries']: + return -1 + elif property1['countUserQueries'] > property2['countUserQueries']: + return 1 + else: + return 0 + + +html = """ + + + + + + + + + + + + + + + + + + + + +""" + + +def createTr(property, parent, parentPrefix): + html = "" + parentPrefix += parent['qid'] + html += "" + html += "" + html += "" + html += "" + + html += "\n" + for child in sorted(property['children'], reverse=True, + key=cmp_to_key(compareProperties)): + html += createTr(child, property, parentPrefix) + + return html + + +html += createTr(rootProperty['children'][0], rootProperty, "") +html += """ + +
Property labelQIDUser Queries which had this query
" + property['name'] + "" + property['qid'][31:] + "" + str(property['countUserQueries']) \ + + "
+ + + + + +""" + +with open("index.html", "w") as htmlFile: + htmlFile.write(html) diff --git a/tools/xyMapping.py b/tools/xyMapping.py index 64bb835..8238a2b 100644 --- a/tools/xyMapping.py +++ b/tools/xyMapping.py @@ -4,10 +4,10 @@ from collections import defaultdict -import config +from . import config -from postprocess import processdata -from utility import utility +from .postprocess import processdata +from .utility import utility def writeOutMethod(filename, fieldValues, dictionary, headerStart): with open(filename, "w") as file: @@ -18,7 +18,7 @@ def writeOutMethod(filename, fieldValues, dictionary, headerStart): for j in sorted(dictionary.keys()): line = str(j) for field in sorted(fieldValues): - if field in dictionary[j].keys(): + if field in list(dictionary[j].keys()): line += "\t" + str(dictionary[j][field]) else: line += "\t0" @@ -30,7 +30,7 @@ def xyMapping(month, metricOne, metricTwo, monthsFolder = config.monthsFolder, i if os.path.isfile(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked") \ and not ignoreLock: - print "ERROR: The month " + month + " is being edited at the " + print("ERROR: The month " + month + " is being edited at the ") + "moment. Use -i if you want to force the execution of this script." 
sys.exit() diff --git a/tools/xyMapping.py.bak b/tools/xyMapping.py.bak new file mode 100644 index 0000000..64bb835 --- /dev/null +++ b/tools/xyMapping.py.bak @@ -0,0 +1,134 @@ +import argparse +import os +import sys + +from collections import defaultdict + +import config + +from postprocess import processdata +from utility import utility + +def writeOutMethod(filename, fieldValues, dictionary, headerStart): + with open(filename, "w") as file: + header = headerStart + for field in sorted(fieldValues): + header += "\t" + field + file.write(header + "\n") + for j in sorted(dictionary.keys()): + line = str(j) + for field in sorted(fieldValues): + if field in dictionary[j].keys(): + line += "\t" + str(dictionary[j][field]) + else: + line += "\t0" + file.write(line + "\n") + + + +def xyMapping(month, metricOne, metricTwo, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", nosplittingOne = False, nosplittingTwo = False, writeOut = False, notifications = True): + if os.path.isfile(utility.addMissingSlash(monthsFolder) + + utility.addMissingSlash(month) + "locked") \ + and not ignoreLock: + print "ERROR: The month " + month + " is being edited at the " + + "moment. Use -i if you want to force the execution of this script." 
+ sys.exit() + + metricOne = utility.argMetric(metricOne) + metricTwo = utility.argMetric(metricTwo) + + folderName = metricOne + "_" + metricTwo + + pathBase = utility.addMissingSlash(monthsFolder) \ + + utility.addMissingSlash(month) \ + + utility.addMissingSlash(folderName) + + if outputPath is not None: + pathBase = utility.addMissingSlash(outputPath) + + outputFile = month.strip("/").replace("/", "_") + "_" + folderName + ".tsv" + + if outputFilename is not None: + outputFile = outputFilename + + filter = utility.filter() + + filter.setup(filterParams) + + + class hourlyFieldValueHandler: + monthlyFieldValues = set() + + monthlyData = dict() + + def handle(self, sparqlQuery, processed): + if not filter.checkLine(processed): + return + + entriesOne = utility.fetchEntries(processed, metricOne, nosplittingOne) + + for keyTwo in utility.fetchEntries(processed, metricTwo, nosplittingTwo): + if keyTwo not in self.monthlyData: + self.monthlyData[keyTwo] = defaultdict(int) + + for keyOne in entriesOne: + self.monthlyFieldValues.add(keyOne) + self.monthlyData[keyTwo][keyOne] += 1 + + def writeHourlyValues(self): + writeOutMethod(pathBase + outputFile, self.monthlyFieldValues, self.monthlyData, metricTwo + "\\" + metricOne) + + handler = hourlyFieldValueHandler() + + processdata.processMonth(handler, month, monthsFolder, notifications = notifications) + + if writeOut: + if not os.path.exists(pathBase): + os.makedirs(pathBase) + handler.writeHourlyValues() + return (handler.monthlyFieldValues, handler.monthlyData) + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="This script creates tables displaying all values of the " + + "specified first metric and their occurence for the specified second metric count.") + parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, + type=str, help="The folder in which the months directory " + + "are residing.") + parser.add_argument("--ignoreLock", "-i", help="Ignore locked file and 
execute" + + " anyways", action="store_true") + parser.add_argument("--suppressNotifications", "-s", help = "Suppress notifications from processdata.py.", action = "store_true") + parser.add_argument("--outputPath", "-p", type=str, help="The path where the" + + " output files should be generated.") + parser.add_argument("--outputFilename", "-o", type=str, help="The name of the output file to be generated.") + parser.add_argument("--filter", "-f", default="", type=str, + help="Constraints used to limit the lines used to generate" + + " the output." + + " Default filter is Valid=^VALID$." + + " Enter as =,/ " + + "(e.g. QueryType=wikidataLastModified,ToolName=^USER$)" + + " NOTE: If you use this option you should probably also" + + "set the --outputPath to some value other than the " + + "default.") + parser.add_argument("--nosplittingOne", "-n1", help="Check if you do not want the" + + " script to split entries for metric one at commas and count each part" + + " separately but instead just to sort such entries and " + + "count them as a whole.", action="store_true") + parser.add_argument("--nosplittingTwo", "-n2", help="Check if you do not want the" + + " script to split entries for metric one at commas and count each part" + + " separately but instead just to sort such entries and " + + "count them as a whole.", action="store_true") + parser.add_argument("metricOne", type=str, help="The metric that should be ranked") + parser.add_argument("metricTwo", type=str, help="The metric that should be ranked") + parser.add_argument("month", type=str, + help="The month for which the ranking should be " + + "generated.") + + if (len(sys.argv[1:]) == 0): + parser.print_help() + parser.exit() + + args = parser.parse_args() + + + xyMapping(args.month, args.metricOne, args.metricTwo, monthsFolder = args.monthsFolder, ignoreLock = args.ignoreLock, outputPath = args.outputPath, outputFilename = args.outputFilename, filterParams = args.filter, nosplittingOne = args.nosplittingOne, 
nosplittingTwo = args.nosplittingTwo, writeOut = True, notifications = not args.suppressNotifications) From 1944abfd756fb3e1a83059b4d3002183d59125ef Mon Sep 17 00:00:00 2001 From: Lindsay Erickson Date: Fri, 27 Feb 2026 10:29:34 -0500 Subject: [PATCH 2/7] python2 to 3 update: write utf-8 encoded strings. --- tools/QueryAnalysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/QueryAnalysis.py b/tools/QueryAnalysis.py index bca8591..68f85f9 100644 --- a/tools/QueryAnalysis.py +++ b/tools/QueryAnalysis.py @@ -130,12 +130,12 @@ # raw log data file (with added headers) with gzip.open(rawLogDataDirectory + "QueryCnt" + "%02d"%day + ".tsv.gz", "wb") as dayfile: - dayfile.write(header) + dayfile.write(header.encode('utf-8')) for filename in glob.glob(tempDirectory + '*'): with open(filename) as temp: for line in temp: - dayfile.write(line) + dayfile.write(line.encode('utf-8')) shutil.rmtree(tempDirectory) From 560aefe36a1c0766b82e4cc05cb9e21f4a896a24 Mon Sep 17 00:00:00 2001 From: Lindsay Erickson Date: Fri, 27 Feb 2026 13:59:40 -0500 Subject: [PATCH 3/7] remove nonexistent import from QueryAnalysis.py: unifyQueryTypes --- tools/QueryAnalysis.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/QueryAnalysis.py b/tools/QueryAnalysis.py index 68f85f9..6482791 100644 --- a/tools/QueryAnalysis.py +++ b/tools/QueryAnalysis.py @@ -7,7 +7,6 @@ import subprocess import sys import gzip -import unifyQueryTypes from .utility import utility from . 
import config @@ -116,6 +115,10 @@ hive_call += field + ", " hive_call = hive_call[:-2] + " " + ########## NEEDS UPDATE ############## + # We must query + # https://datahub.wikimedia.org/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,event.wdqs_external_sparql_query,PROD)/Schema?is_lineage_mode=false&schemaFilter= + # hive_call += ' from wmf.wdqs_extract where uri_query<>"" ' \ + 'and year=\'' + str(args.year) + '\' and month=\'' \ + str(months[monthName][0]) + '\' and day=\'' + str(day) + '\'' From 9ecd3612eefd5f95136a196be3cd0871b01b877a Mon Sep 17 00:00:00 2001 From: Lindsay Erickson Date: Fri, 27 Feb 2026 14:16:35 -0500 Subject: [PATCH 4/7] Fix directory issue: code was assumed to be running from inside /tools. --- tools/Anonymize.py | 1 - tools/QueryAnalysis.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tools/Anonymize.py b/tools/Anonymize.py index 731df4a..a6bcbfd 100644 --- a/tools/Anonymize.py +++ b/tools/Anonymize.py @@ -38,7 +38,6 @@ mavenCall.append(mavenArguments) owd = os.getcwd() - os.chdir("..") print("Starting data processing using Anonymizer for " + monthName + ".") diff --git a/tools/QueryAnalysis.py b/tools/QueryAnalysis.py index 6482791..14da986 100644 --- a/tools/QueryAnalysis.py +++ b/tools/QueryAnalysis.py @@ -165,7 +165,6 @@ mavenCall.append(mavenArguments) owd = os.getcwd() - os.chdir("..") print("Starting data processing using QueryAnalysis for " + monthName + ".") From 00c3e73d651e0f691e4a0c0d218ccbff66e8f6c0 Mon Sep 17 00:00:00 2001 From: Lindsay Erickson Date: Fri, 27 Feb 2026 15:37:42 -0500 Subject: [PATCH 5/7] update placeholder paths in config.py --- tools/config.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/config.py b/tools/config.py index 309c853..eeb9e62 100644 --- a/tools/config.py +++ b/tools/config.py @@ -1,8 +1,9 @@ -monthsFolder = "/home/mkroetzsch/querydata/months" -queryReferenceDirectory = "/home/mkroetzsch/querydata/queryTypeReferenceFolder" -fdupesExecutable = 
"/home/mkroetzsch/querydata/utility/fdupes" -dbLocation = "/home/mkroetzsch/querydata/months/uniqueQueryMap.db" -queryTypeMapDbLocation = "/home/mkroetzsch/querydata/months/queryTypeMap.db" +# Edit these paths to make sense with your setup. +monthsFolder = "/home/your-user/querydata/months" +queryReferenceDirectory = "/home/your-user/querydata/queryTypeReferenceFolder" +fdupesExecutable = "/home/your-user/querydata/utility/fdupes" +dbLocation = "/home/your-user/querydata/months/uniqueQueryMap.db" +queryTypeMapDbLocation = "/home/your-user/querydata/months/queryTypeMap.db" processedFolderName = "processedLogData/" sourceFolderName = "rawLogData/" From d88138a827a6efa83676cff3e54e5474abcf8997 Mon Sep 17 00:00:00 2001 From: Lindsay Erickson Date: Mon, 2 Mar 2026 15:36:23 -0500 Subject: [PATCH 6/7] Update README with python3 directions, and add directions for running Anonymize --- README.md | 25 +++++++++++++++++++------ tools/config.py | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5af7d9b..9296b5b 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ## Getting Started ### Prerequisites -You need to have `Maven`, `OpenJDK 8` and `Python 2` installed. +You need to have `Maven`, `OpenJDK 8` and `Python 3` installed. ### Installing ```shell @@ -11,6 +11,8 @@ $ mvn clean package ``` ### Running the main Java log analyser +(Note that you probably don't want to do this and should just run the Python harness described below, +because it does some necessary work to set arguments and create directories.) 
```shell # Processes the example SPARQL log files into exampleMonthsFolder/exampleMonth/processedLogData $ mvn exec:java@QueryAnalysis -Dexec.args="-w exampleMonthsFolder/exampleMonth -logging" @@ -20,17 +22,28 @@ $ mvn exec:java@QueryAnalysis -Dexec.args="--help" ``` -**Important:** In order to not flush the command line with error messages all uncatched Runtime Exceptions are being written to the log files, residing in the logs/ folder, so please have a look at those regularly. The logs are not generated by default, so you should enable them using the `-l` option. +**Important:** In order to not flush the command line with error messages all uncaught Runtime Exceptions are being written to the log files, residing in the logs/ folder, so please have a look at those regularly. The logs are not generated by default, so you should enable them using the `-l` option. -### Running the QueryAnalysis-Script -The QueryAnalysis-Script handles both steps: Extraction using hive and processing using the java application. Extraction using hive only works on the server, but is ignored if the month exists in the months folder. To run the QueryAnalysis-Script locally, you need to provide the local months folder. +### Running the QueryAnalysis script +The QueryAnalysis script handles both steps: extraction using hive and processing using the java application. +Extraction using hive only works on the server, but is ignored if the month exists in the months folder. +To run the QueryAnalysis script locally, you need to provide the local months folder. 
```shell -$ python QueryAnalysis exampleMonth -m ../exampleMonthsFolder +# The -l option enables logging +$ python3 -m tools.QueryAnalysis exampleMonth -m ../exampleMonthsFolder -l # You can also specify multiple months in the same directory by separating them using commas -$ python QueryAnalysis exampleMonth,otherMonth -m ../exampleMonthsFolder +$ python3 -m tools.QueryAnalysis exampleMonth,otherMonth -m ../exampleMonthsFolder -l ``` +You will also need to update `tools/config.py` with paths in your own directory. + +## Running the Anonymization script +After you've extracted the raw query data (done as the first step in the `QueryAnalysis` script above), +you can anonymize the extracted queries for the specified month(s). +```shell +$ python3 -m tools.Anonymize exampleMonth -m exampleMonthsFolder/ -l +``` ## License The code in this repository is released under the [Apache 2.0](LICENSE.txt) license. External libraries used may have their own licensing terms. diff --git a/tools/config.py b/tools/config.py index eeb9e62..e7a8449 100644 --- a/tools/config.py +++ b/tools/config.py @@ -1,4 +1,4 @@ -# Edit these paths to make sense with your setup. +# Note to user: you must edit these paths to make sense with your setup. monthsFolder = "/home/your-user/querydata/months" queryReferenceDirectory = "/home/your-user/querydata/queryTypeReferenceFolder" fdupesExecutable = "/home/your-user/querydata/utility/fdupes" From 9329e12c38e8feada765f476310a7bafddd85d77 Mon Sep 17 00:00:00 2001 From: Lindsay Erickson Date: Mon, 2 Mar 2026 16:23:29 -0500 Subject: [PATCH 7/7] Update README to explain current state; add some comments to QueryAnalysis.py --- README.md | 13 +++++++++++++ tools/QueryAnalysis.py | 13 +++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9296b5b..55aa711 100644 --- a/README.md +++ b/README.md @@ -45,5 +45,18 @@ you can anonymize the extracted queries for the specified month(s). 
$ python3 -m tools.Anonymize exampleMonth -m exampleMonthsFolder/ -l ``` +## Caveat emptor + +This code won't work in the current form on the server. +The `hive` call in `tools/QueryAnalysis.py` needs to be updated with the current location of the relevant data. +See comments in that file. + +Depending on what is ultimately extracted from logs, downstream changes in the Java code will probably be necessary. +For example, `InputHandlerTSV.java` currently expects a URL-encoded query, but as of March 2026 the logs are +not stored in this form. + +Additionally, it will be important to verify that the call to `StandardizingSPARQLParser.anonymize()` +in `OutputHandlerAnonymizer.java` correctly anonymizes the input queries. + ## License The code in this repository is released under the [Apache 2.0](LICENSE.txt) license. External libraries used may have their own licensing terms. diff --git a/tools/QueryAnalysis.py b/tools/QueryAnalysis.py index 14da986..76d574a 100644 --- a/tools/QueryAnalysis.py +++ b/tools/QueryAnalysis.py @@ -115,10 +115,15 @@ hive_call += field + ", " hive_call = hive_call[:-2] + " " - ########## NEEDS UPDATE ############## - # We must query - # https://datahub.wikimedia.org/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,event.wdqs_external_sparql_query,PROD)/Schema?is_lineage_mode=false&schemaFilter= - # + ########## NEEDS UPDATE! ############## + # This hive call is obsolete and doesn't return any results. + # The wmf.wdqs_extract data store is no longer the right place to look. + # The query needs to be updated based on the current internal data schema. + # + # The only fields that are used in the anonymization part of the code + # are |uri_query| and |ts|; the others are for analysis only. + # + # |uri_query| is assumed to have the form "?query=" hive_call += ' from wmf.wdqs_extract where uri_query<>"" ' \ + 'and year=\'' + str(args.year) + '\' and month=\'' \ + str(months[monthName][0]) + '\' and day=\'' + str(day) + '\''