From 597c632bda3098de37a32761b010928ea122f0ed Mon Sep 17 00:00:00 2001 From: Robert Tonsing Date: Sun, 15 Mar 2026 15:39:25 -0500 Subject: [PATCH 1/5] Merge pglaf-gitpull into gitpull --- README.md | 6 ++++-- gitpull.py | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++--- setup.py | 2 +- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 3425d53..97f9c91 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,8 @@ for gitpull: - `-h, --help`: Show help message and exit - `-v, --verbose`: Enable verbose output +- `--norepo`: Do not keep Git history +- `--createdir`: Create `target_path` if needed ### Examples for gitpull @@ -108,9 +110,9 @@ or - `git checkout -f origin/main` - Restore state - `git switch main` - - Remove untracked files - force, include directories & ignored (.zip) files + - Remove untracked files - force, include directories, & ignored (.zip) files - `git clean -fdx` -- **The eBook folder will now be a Git repository**: but only for updated eBooks +- **The eBook folder will now be a Git repository, unless `--norepo` was used** - **It does not update the database**: It is assumed that the chron-dopush.sh call to autodelete.py will do that diff --git a/gitpull.py b/gitpull.py index 75bd59e..5f4bc43 100755 --- a/gitpull.py +++ b/gitpull.py @@ -27,7 +27,9 @@ def run_command(cmd, cwd=None, noerror=False): text=True, check=True ) - return result.stdout.strip() + if result.stdout: + result.stdout = result.stdout.strip() + return result.stdout except subprocess.CalledProcessError as e: if not noerror: logger.error(f"Error running command: {' '.join(cmd)}") @@ -175,6 +177,28 @@ def update_folder(repo_url, target_path): return False +def remove_git_history(target_path): + """ + Remove Git history from the target path. + Deletes the .git directory and common Git-related files like .gitignore, .gitattributes, + README.md, and LICENSE.txt if they exist. + It might be cleaner to use "git archive" to export only the files without Git history, + but our server does not support the protocol. Would also need to remove untracked files. + Any existing unchanged files should not be updated. + """ + git_dir = Path(target_path) / ".git" + if git_dir.exists() and git_dir.is_dir(): + shutil.rmtree(git_dir) + logger.info("Git history removed successfully") + else: + logger.info("No Git history found to remove") + files_to_remove = [".gitignore", ".gitattributes", "README.md", "LICENSE.txt"] + for filename in files_to_remove: + file_path = Path(target_path) / filename + if file_path.exists(): + file_path.unlink() + logger.info(f"{filename} removed successfully") + return True def main(): """Main entry point for the script.""" parser = argparse.ArgumentParser( @@ -194,6 +218,16 @@ def main(): action="store_true", help="Enable verbose output" ) + parser.add_argument( + "--norepo", + action="store_true", + help="Do not keep Git history" + ) + parser.add_argument( + "--createdirs", + action="store_true", + help="Create target directories if they don't exist" + ) args = parser.parse_args() @@ -204,8 +238,18 @@ def main(): # Check if target exists and is a directory target_path = Path(args.target_path).resolve() if not target_path.exists() or not target_path.is_dir(): - logger.error(f"{args.target_path} does not exist or is not a directory") - sys.exit(1) + if args.createdirs: + logger.info(f"Creating target directory: {target_path}") + try: + target_path.mkdir(parents=True, exist_ok=True) + except Exception as e: + logger.error(f"Failed to create target directory: {e}") + print(f"Failed: unable to create target directory {target_path}, see log.") + sys.exit(1) + else: + logger.error(f"{args.target_path} does not exist or is not a directory") + print(f"Failed: {args.target_path} does not exist or is not a directory") + sys.exit(1) # Update the directory origin = f"https://r.pglaf.org/git/{args.ebook_number}.git/" @@ -215,7 +259,13 @@ def main(): logger.info(f"Pulling from {origin} to {destination}") success = update_folder(origin, destination) + if args.norepo and success: + success = remove_git_history(destination) + if success: + print(f"Success: eBook {args.ebook_number} copied to {destination}.") + else: + print(f"Failed: unable to copy eBook {args.ebook_number}, see log.") sys.exit(0 if success else 1) diff --git a/setup.py b/setup.py index 0357e4b..4bc52e3 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="gitpull", - version="0.2.0", + version="0.3.0", author="Project Gutenberg", description="Update a folder with the latest files from a Git repository", long_description=long_description, From 9fe36f85e515804220211a9cf2dfdd36c354d634 Mon Sep 17 00:00:00 2001 From: Robert Tonsing Date: Mon, 16 Mar 2026 16:08:59 -0500 Subject: [PATCH 2/5] add version --- gitpull.py | 18 +++++++++++++++++- puller.py | 2 -- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/gitpull.py b/gitpull.py index 5f4bc43..bb2f965 100755 --- a/gitpull.py +++ b/gitpull.py @@ -12,7 +12,10 @@ import sys import logging from pathlib import Path +import shutil +VERSION = "2026.03.16" +UPSTREAM_REPO_DIR = os.getenv('UPSTREAM_REPO_DIR') or 'https://github.com/gutenbergbooks/' # Configure logging logging.basicConfig(filename='gitpull.log', level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) @@ -199,12 +202,19 @@ def remove_git_history(target_path): file_path.unlink() logger.info(f"{filename} removed successfully") return True + + def main(): """Main entry point for the script.""" parser = argparse.ArgumentParser( description="Update an eBook folder with the latest files from the Git repository", epilog="Example: %(prog)s 12345 /path/to/target" ) + parser.add_argument( + "--version", + action="store_true", + help="Show version information" + ) parser.add_argument( "ebook_number", help="Number of the eBook Git repository to clone/pull from" @@ -231,6 +241,9 @@ def main(): args = parser.parse_args() + if args.version: + print(f"gitpull version {VERSION}") + sys.exit(0) # Set logging level based on verbosity if args.verbose: logger.setLevel(logging.DEBUG) @@ -239,6 +252,7 @@ def main(): target_path = Path(args.target_path).resolve() if not target_path.exists() or not target_path.is_dir(): if args.createdirs: + # Create the target directory if it doesn't exist logger.info(f"Creating target directory: {target_path}") try: target_path.mkdir(parents=True, exist_ok=True) @@ -252,13 +266,15 @@ def main(): sys.exit(1) # Update the directory - origin = f"https://r.pglaf.org/git/{args.ebook_number}.git/" + origin = f"{UPSTREAM_REPO_DIR}{args.ebook_number}.git/" # destination is a directory named with the ebook number under the target path destination = f"{args.target_path}/{args.ebook_number}" logger.info(f"Pulling from {origin} to {destination}") success = update_folder(origin, destination) + # Remove Git history if not needed, but only if the update was successful to avoid + # deleting existing files on failure if args.norepo and success: success = remove_git_history(destination) diff --git a/puller.py b/puller.py index 04bbf54..1c05cf7 100644 --- a/puller.py +++ b/puller.py @@ -48,8 +48,6 @@ def scan_dopull_log(): """ Scan the dopull log directory for new files. """ - current_user_id = os.getuid() - for filename in sorted(os.listdir(DOPULL_LOG_DIR)): mode = os.stat(os.path.join(DOPULL_LOG_DIR, filename))[stat.ST_MODE] # skip directories JIC From d30fb2716efc33f271505e021ff101c8108dfb53 Mon Sep 17 00:00:00 2001 From: Robert Tonsing Date: Mon, 16 Mar 2026 16:09:26 -0500 Subject: [PATCH 3/5] add updatehosts.py --- updatehosts.py | 165 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 updatehosts.py diff --git a/updatehosts.py b/updatehosts.py new file mode 100644 index 0000000..7a7ee46 --- /dev/null +++ b/updatehosts.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +""" +from gutenbergtools/pglaf-gitpull: Update a folder with the latest files from a Git repository + +This tool clones or pulls the latest changes from a Git repository into a +specified target folder. +""" +import argparse +import os +import subprocess +import logging + +# Configure logging +logging.basicConfig(filename='updatehosts.log', level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +PRIVATE = os.getenv('PRIVATE') or '' +# These are where .zip.trig files go on ibiblio : +DOPULL_LOG_DIR = os.path.join(PRIVATE, 'logs', 'dopull') +IBIBLIO_BIN = os.getenv('IBIBLIO_BIN') or '' +MIRROR_BIN = os.getenv('MIRROR_BIN') or '' + +ibiblio = "gutenberg.login.ibiblio.org" +mirrors = ["inferno.pglaf.org", + "aleph.pglaf.org", + "readingroo.ms"] + +def load_env_file(env_file='.env'): + """ + Load environment variables from a .env file. + Assumes the file is in the current directory and contains key=value pairs. + Skips lines starting with # (comments) and empty lines. + """ + if not os.path.exists(env_file): + logger.warning(f".env file not found: {env_file}") + return + with open(env_file, 'r') as f: + for line in f: + line = line.strip() + if not line or line.startswith('#'): + continue + if '=' in line: + key, value = line.split('=', 1) + os.environ[key.strip()] = value.strip() + logger.info(f"Loaded env var: {key.strip()}") + +def run_python_script_via_ssh(host, script_path, script_args=None, timeout=60): + """Run a Python script on a remote server via SSH.""" + if script_args is None: + script_args = [] + + remote_command = f"python3 {script_path}" + try: + logger.info(f"[START] Running Python script on {host}: {remote_command} {' '.join(script_args)}") + output = run_ssh_command(host, remote_command, arguments=script_args, timeout=timeout) + logger.info(f"[SUCCESS] Output from {host}: {output}") + return output + except Exception as e: + logger.error(f"[ERROR] Failed to run Python script on {host}: {str(e)}") + raise + +def run_ssh_command(host, command, arguments=None, timeout=60): + """Run a shell command on a remote host via SSH with optional arguments.""" + if arguments is None: + arguments = [] + + # Append arguments to the command + full_command = f"{command} {' '.join(arguments)}" + try: + logger.info(f"[START] Running command on {host}: {full_command}") + result = subprocess.run( + ["ssh", host, full_command], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=timeout + ) + logger.info(f"[SUCCESS] Command output from {host}: {result.stdout}") + if result.stderr: + logger.warning(f"[WARNING] Command stderr from {host}: {result.stderr}") + return result.stdout + except subprocess.TimeoutExpired: + logger.error(f"[TIMEOUT] Command timed out after {timeout} seconds on {host}: {full_command}") + raise + except subprocess.CalledProcessError as e: + logger.error(f"[FAILURE] Command failed on {host}: {e.stderr}") + raise + except Exception as e: + logger.error(f"[ERROR] Unexpected error while running command on {host}: {str(e)}") + raise + +def get_ebook_path(number): + """Get PG directory path: 12345 --> 1/2/3/4/""" + outdir = '/'.join(number) + '/' + + # Ditch the last digit to make the target subdirectory + if len(outdir) == 2: # Special case: Single digit filenames will prefix with '0/' + outdir = '0/' + else: + where = outdir.rfind('/') + if where != -1: + outdir = outdir[:where - 1] # It's always 1 digit + return outdir + +def update_gitpull_to_hosts(): + """ + Update the gitpull script on all hosts. + Assumes the source script is named 'gitpull.py' and is located in the current directory. + """ + for host in mirrors + [ibiblio]: + logger.info(f"Updating gitpull.py script on {host}...") + if not os.path.exists('gitpull.py'): + print("gitpull.py script not found in the current directory.") + return 1 + if not IBIBLIO_BIN or not MIRROR_BIN: + print("IBIBLIO_BIN or MIRROR_BIN environment variable not set.") + return 1 + try: + if host == ibiblio: + result = run_ssh_command(host, "scp", ["gitpull.py", f"{host}:{IBIBLIO_BIN}"]) + else: + result = run_ssh_command(host, "scp", ["gitpull.py", f"{host}:{MIRROR_BIN}"]) + print(f"Successfully updated gitpull script on {host}") + except Exception as e: + result = f"Failed to update gitpull script on {host}: {str(e)}" + logger.error(result) + return 1 + + print(f"Finished updating gitpull.py on {host}, result = {result}\n") + return 0 + +def main(): + """Main entry point for the script.""" + load_env_file() # Load .env variables at the start + parser = argparse.ArgumentParser( + description="Update an eBook directory on the mirrors with the latest files from the Git repository", + epilog="Example: %(prog)s 12345" + ) + parser.add_argument( + "ebook_number", + help="Number of the eBook Git repository to pull from" + ) + args = parser.parse_args() + + # Get the destination path for the eBook number + destination = get_ebook_path(args.ebook_number) + print(f"{args.ebook_number} goes to {destination}\n") + destination = "~/ftp/" + destination + for host in mirrors: + print("Copying to " + host + "...") + # Call gitpull.py on the host, creating the target directory if it doesn't exist, no history + sargs = ["--norepo", "--createdir", f"{args.ebook_number}", f"{destination}"] + run_python_script_via_ssh(host, f"{MIRROR_BIN}/gitpull.py", sargs) + print("Success!\n") + + # ibiblio is a special case, it needs to trigger other actions after the pull, + # so we just trigger the pull there and let it do the rest + print(f"Trigger processing of #{args.ebook_number} on ibiblio...") + run_ssh_command(ibiblio, "touch", [f"{DOPULL_LOG_DIR}{args.ebook_number}.zip.trig"]) + print("Success!\n") + + +if __name__ == "__main__": + main() From f734c4516d8702ee8df9577db8e2e21d589fa301 Mon Sep 17 00:00:00 2001 From: Robert Tonsing Date: Tue, 17 Mar 2026 15:14:41 -0500 Subject: [PATCH 4/5] tweak repo site name --- gitpull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitpull.py b/gitpull.py index bb2f965..4c56e2b 100755 --- a/gitpull.py +++ b/gitpull.py @@ -266,7 +266,7 @@ def main(): sys.exit(1) # Update the directory - origin = f"{UPSTREAM_REPO_DIR}{args.ebook_number}.git/" + origin = f"{UPSTREAM_REPO_DIR}/{args.ebook_number}.git/" # destination is a directory named with the ebook number under the target path destination = f"{args.target_path}/{args.ebook_number}" From 660776718a05cb4f2156bf6d7318cccf5d5a13d5 Mon Sep 17 00:00:00 2001 From: Robert Tonsing Date: Tue, 17 Mar 2026 15:53:50 -0500 Subject: [PATCH 5/5] add environment checking --- gitpull.py | 12 +++++++++--- updatehosts.py | 10 +++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/gitpull.py b/gitpull.py index 4c56e2b..7f93da2 100755 --- a/gitpull.py +++ b/gitpull.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -from gutenbergtools/pglaf-gitpull: Update a folder with the latest files from a Git repository +from gutenbergtools/gitpull: Update a folder with the latest files from a Git repository -This tool clones or pulls the latest changes from a Git repository into a +This tool clones or pulls the latest changes from a PG Git eBook repository into a specified target folder. """ @@ -15,7 +15,8 @@ import shutil VERSION = "2026.03.16" -UPSTREAM_REPO_DIR = os.getenv('UPSTREAM_REPO_DIR') or 'https://github.com/gutenbergbooks/' +UPSTREAM_REPO_DIR = os.getenv('UPSTREAM_REPO_DIR') or '' + # Configure logging logging.basicConfig(filename='gitpull.log', level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) @@ -248,6 +249,11 @@ def main(): if args.verbose: logger.setLevel(logging.DEBUG) + if not UPSTREAM_REPO_DIR: + logger.error("UPSTREAM_REPO_DIR environment variable is not set") + print("Failed: UPSTREAM_REPO_DIR environment variable is not set.") + sys.exit(1) + # Check if target exists and is a directory target_path = Path(args.target_path).resolve() if not target_path.exists() or not target_path.is_dir(): diff --git a/updatehosts.py b/updatehosts.py index 7a7ee46..b0950c4 100644 --- a/updatehosts.py +++ b/updatehosts.py @@ -9,6 +9,7 @@ import os import subprocess import logging +import sys # Configure logging logging.basicConfig(filename='updatehosts.log', level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") @@ -33,7 +34,7 @@ def load_env_file(env_file='.env'): """ if not os.path.exists(env_file): logger.warning(f".env file not found: {env_file}") - return + return False with open(env_file, 'r') as f: for line in f: line = line.strip() @@ -43,6 +44,7 @@ def load_env_file(env_file='.env'): key, value = line.split('=', 1) os.environ[key.strip()] = value.strip() logger.info(f"Loaded env var: {key.strip()}") + return True def run_python_script_via_ssh(host, script_path, script_args=None, timeout=60): """Run a Python script on a remote server via SSH.""" @@ -132,7 +134,9 @@ def update_gitpull_to_hosts(): def main(): """Main entry point for the script.""" - load_env_file() # Load .env variables at the start + if not load_env_file(): # Load .env variables at the start + print("Failed to load environment variables.") + sys.exit(1) parser = argparse.ArgumentParser( description="Update an eBook directory on the mirrors with the latest files from the Git repository", epilog="Example: %(prog)s 12345" @@ -159,7 +163,7 @@ def main(): print(f"Trigger processing of #{args.ebook_number} on ibiblio...") run_ssh_command(ibiblio, "touch", [f"{DOPULL_LOG_DIR}{args.ebook_number}.zip.trig"]) print("Success!\n") - + sys.exit(0) if __name__ == "__main__": main()