Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ for gitpull:

- `-h, --help`: Show help message and exit
- `-v, --verbose`: Enable verbose output
- `--norepo`: Do not keep Git history
- `--createdirs`: Create `target_path` if needed

### Examples for gitpull

Expand Down Expand Up @@ -108,9 +110,9 @@ or
- `git checkout -f origin/main`
- Restore state
- `git switch main`
- Remove untracked files - force, include directories & ignored (.zip) files
- Remove untracked files - force, include directories, & ignored (.zip) files
- `git clean -fdx`
- **The eBook folder will now be a Git repository**: but only for updated eBooks
- **The eBook folder will now be a Git repository, unless `--norepo` was used**
- **It does not update the database**: It is assumed that the chron-dopush.sh call to autodelete.py will do that


Expand Down
84 changes: 78 additions & 6 deletions gitpull.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""
from gutenbergtools/pglaf-gitpull: Update a folder with the latest files from a Git repository
from gutenbergtools/gitpull: Update a folder with the latest files from a Git repository

This tool clones or pulls the latest changes from a Git repository into a
This tool clones or pulls the latest changes from a PG Git eBook repository into a
specified target folder.
"""

Expand All @@ -12,6 +12,10 @@
import sys
import logging
from pathlib import Path
import shutil

VERSION = "2026.03.16"
UPSTREAM_REPO_DIR = os.getenv('UPSTREAM_REPO_DIR') or ''

# Configure logging
logging.basicConfig(filename='gitpull.log', level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
Expand All @@ -27,7 +31,9 @@ def run_command(cmd, cwd=None, noerror=False):
text=True,
check=True
)
return result.stdout.strip()
if result.stdout:
result.stdout = result.stdout.strip()
return result.stdout
except subprocess.CalledProcessError as e:
if not noerror:
logger.error(f"Error running command: {' '.join(cmd)}")
Expand Down Expand Up @@ -175,12 +181,41 @@ def update_folder(repo_url, target_path):
return False


def remove_git_history(target_path):
    """
    Remove Git history from the target path.

    Deletes the .git directory and common Git-related files (.gitignore,
    .gitattributes, README.md, and LICENSE.txt) if they exist. All other
    files in the target path are left untouched.

    It might be cleaner to use "git archive" to export only the files without
    Git history, but our server does not support the protocol. Would also need
    to remove untracked files. Any existing unchanged files should not be
    updated.

    Args:
        target_path: Directory (str or Path) to strip of Git metadata.

    Returns:
        True when everything that had to be removed was removed (including
        the case where there was nothing to remove); False when a filesystem
        error prevented a removal. The error is written to the log.
    """
    target = Path(target_path)
    git_dir = target / ".git"
    try:
        # is_dir() is already False for a missing path, no separate exists() needed
        if git_dir.is_dir():
            shutil.rmtree(git_dir)
            logger.info("Git history removed successfully")
        else:
            logger.info("No Git history found to remove")

        for filename in (".gitignore", ".gitattributes", "README.md", "LICENSE.txt"):
            file_path = target / filename
            if file_path.exists():
                file_path.unlink()
                logger.info(f"{filename} removed successfully")
    except OSError as e:
        # Report failure through the return value so the caller can print its
        # normal "Failed ... see log" message instead of dying with a traceback.
        logger.error(f"Failed to remove Git history from {target}: {e}")
        return False
    return True


def main():
"""Main entry point for the script."""
parser = argparse.ArgumentParser(
description="Update an eBook folder with the latest files from the Git repository",
epilog="Example: %(prog)s 12345 /path/to/target"
)
parser.add_argument(
"--version",
action="store_true",
help="Show version information"
)
parser.add_argument(
"ebook_number",
help="Number of the eBook Git repository to clone/pull from"
Expand All @@ -194,28 +229,65 @@ def main():
action="store_true",
help="Enable verbose output"
)
parser.add_argument(
"--norepo",
action="store_true",
help="Do not keep Git history"
)
parser.add_argument(
"--createdirs",
action="store_true",
help="Create target directories if they don't exist"
)

args = parser.parse_args()

if args.version:
print(f"gitpull version {VERSION}")
sys.exit(0)
# Set logging level based on verbosity
if args.verbose:
logger.setLevel(logging.DEBUG)

if not UPSTREAM_REPO_DIR:
logger.error("UPSTREAM_REPO_DIR environment variable is not set")
print("Failed: UPSTREAM_REPO_DIR environment variable is not set.")
sys.exit(1)

# Check if target exists and is a directory
target_path = Path(args.target_path).resolve()
if not target_path.exists() or not target_path.is_dir():
logger.error(f"{args.target_path} does not exist or is not a directory")
sys.exit(1)
if args.createdirs:
# Create the target directory if it doesn't exist
logger.info(f"Creating target directory: {target_path}")
try:
target_path.mkdir(parents=True, exist_ok=True)
except Exception as e:
logger.error(f"Failed to create target directory: {e}")
print(f"Failed: unable to create target directory {target_path}, see log.")
sys.exit(1)
else:
logger.error(f"{args.target_path} does not exist or is not a directory")
print(f"Failed: {args.target_path} does not exist or is not a directory")
sys.exit(1)

# Update the directory
origin = f"https://r.pglaf.org/git/{args.ebook_number}.git/"
origin = f"{UPSTREAM_REPO_DIR}/{args.ebook_number}.git/"

# destination is a directory named with the ebook number under the target path
destination = f"{args.target_path}/{args.ebook_number}"
logger.info(f"Pulling from {origin} to {destination}")

success = update_folder(origin, destination)
# Remove Git history if not needed, but only if the update was successful to avoid
# deleting existing files on failure
if args.norepo and success:
success = remove_git_history(destination)

if success:
print(f"Success: eBook {args.ebook_number} copied to {destination}.")
else:
print(f"Failed: unable to copy eBook {args.ebook_number}, see log.")
sys.exit(0 if success else 1)


Expand Down
2 changes: 0 additions & 2 deletions puller.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ def scan_dopull_log():
"""
Scan the dopull log directory for new files.
"""
current_user_id = os.getuid()

for filename in sorted(os.listdir(DOPULL_LOG_DIR)):
mode = os.stat(os.path.join(DOPULL_LOG_DIR, filename))[stat.ST_MODE]
# skip directories JIC
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name="gitpull",
version="0.2.0",
version="0.3.0",
author="Project Gutenberg",
description="Update a folder with the latest files from a Git repository",
long_description=long_description,
Expand Down
169 changes: 169 additions & 0 deletions updatehosts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
from gutenbergtools/gitpull: Push an eBook update out to the mirror hosts

This tool runs gitpull.py over SSH on each mirror to refresh the directory of
one eBook, then touches a .zip.trig trigger file on ibiblio so that host
performs its own pull.
"""
import argparse
import os
import subprocess
import logging
import sys

# Configure logging
# NOTE: runs at import time; the log file name is relative, so it is created
# in whatever directory the script is started from.
logging.basicConfig(filename='updatehosts.log', level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# NOTE: the values below are read from the environment once, when this module
# is imported. Variables placed into os.environ later (for example by
# load_env_file()) do not change these constants.
PRIVATE = os.getenv('PRIVATE') or ''
# These are where .zip.trig files go on ibiblio :
# (no trailing separator - join file names onto it with os.path.join)
DOPULL_LOG_DIR = os.path.join(PRIVATE, 'logs', 'dopull')
# Remote directories that hold gitpull.py on ibiblio and on the mirrors
IBIBLIO_BIN = os.getenv('IBIBLIO_BIN') or ''
MIRROR_BIN = os.getenv('MIRROR_BIN') or ''

# ibiblio is kept apart from the mirrors: main() runs gitpull.py on each
# mirror, but only drops a trigger file on ibiblio.
ibiblio = "gutenberg.login.ibiblio.org"
mirrors = ["inferno.pglaf.org",
           "aleph.pglaf.org",
           "readingroo.ms"]

def load_env_file(env_file='.env'):
    """
    Load environment variables from a .env file into os.environ.

    The file holds one key=value pair per line. Empty lines and lines starting
    with # (comments) are skipped, as are lines that have no '=' or an empty
    key. Whitespace around the key and the value is stripped, and one pair of
    matching quotes around the value is removed. A variable that already
    exists in the environment is overwritten.

    Args:
        env_file: Path of the file to read; defaults to '.env' in the current
            directory.

    Returns:
        True if the file was found and read, False if it does not exist.
    """
    if not os.path.exists(env_file):
        logger.warning(f".env file not found: {env_file}")
        return False
    with open(env_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, sep, value = line.partition('=')
            key = key.strip()
            if not sep or not key:
                # Not a key=value pair. An empty name must be skipped because
                # os.environ rejects it with ValueError.
                continue
            value = value.strip()
            # KEY="value" / KEY='value': the quotes are not part of the value
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os.environ[key] = value
            logger.info(f"Loaded env var: {key}")
    return True

def run_python_script_via_ssh(host, script_path, script_args=None, timeout=60):
    """
    Run a Python script on a remote server via SSH.

    Builds the remote command "python3 <script_path>" and delegates to
    run_ssh_command(), passing the script arguments through unchanged.

    Args:
        host: Host to connect to.
        script_path: Path of the script on the remote host.
        script_args: Optional list of string arguments for the script.
        timeout: Seconds to wait before giving up.

    Returns:
        Whatever run_ssh_command() returns for the command.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    passed_args = [] if script_args is None else script_args
    remote_command = f"python3 {script_path}"

    try:
        logger.info(f"[START] Running Python script on {host}: {remote_command} {' '.join(passed_args)}")
        script_output = run_ssh_command(
            host,
            remote_command,
            arguments=passed_args,
            timeout=timeout,
        )
        logger.info(f"[SUCCESS] Output from {host}: {script_output}")
    except Exception as err:
        logger.error(f"[ERROR] Failed to run Python script on {host}: {str(err)}")
        raise
    else:
        return script_output

def run_ssh_command(host, command, arguments=None, timeout=60):
    """
    Run a shell command on a remote host via SSH with optional arguments.

    The arguments are joined with single spaces and appended to the command;
    the resulting string is handed to ssh as one remote command string.

    Args:
        host: Host to connect to.
        command: Command to run remotely.
        arguments: Optional list of string arguments appended to the command.
        timeout: Seconds to wait before giving up.

    Returns:
        The captured standard output of the command, as text.

    Raises:
        subprocess.TimeoutExpired: The command did not finish within timeout.
        subprocess.CalledProcessError: ssh exited with a non-zero status.
        Exception: Any other failure; every failure is logged, then re-raised.
    """
    extra_args = [] if arguments is None else arguments

    # Append arguments to the command
    full_command = f"{command} {' '.join(extra_args)}"
    try:
        logger.info(f"[START] Running command on {host}: {full_command}")
        completed = subprocess.run(
            ["ssh", host, full_command],
            capture_output=True,
            text=True,
            timeout=timeout,
            check=True,
        )
        remote_stdout = completed.stdout
        remote_stderr = completed.stderr
        logger.info(f"[SUCCESS] Command output from {host}: {remote_stdout}")
        # A successful command may still have written to stderr; record it
        if remote_stderr:
            logger.warning(f"[WARNING] Command stderr from {host}: {remote_stderr}")
        return remote_stdout
    except subprocess.TimeoutExpired:
        logger.error(f"[TIMEOUT] Command timed out after {timeout} seconds on {host}: {full_command}")
        raise
    except subprocess.CalledProcessError as err:
        logger.error(f"[FAILURE] Command failed on {host}: {err.stderr}")
        raise
    except Exception as err:
        logger.error(f"[ERROR] Unexpected error while running command on {host}: {str(err)}")
        raise

def get_ebook_path(number):
    """
    Get PG directory path: 12345 --> 1/2/3/4/

    Every character of number except the last becomes one directory level.
    A single-character number maps to '0/'.
    """
    slashed = '/'.join(number) + '/'

    # Special case: Single digit filenames will prefix with '0/'
    if len(slashed) == 2:
        return '0/'

    # Ditch the last digit (it's always 1 character) to make the target subdirectory
    cut = slashed.rfind('/')
    return slashed if cut == -1 else slashed[:cut - 1]

def update_gitpull_to_hosts():
    """
    Update the gitpull script on all hosts (every mirror, then ibiblio).

    Assumes the source script is named 'gitpull.py' and is located in the
    current directory. The file is copied with a locally executed scp into
    IBIBLIO_BIN on ibiblio and into MIRROR_BIN on every other host.

    Returns:
        0 when the script was copied to every host; 1 when gitpull.py is
        missing, a target directory is not configured, or a copy fails
        (the remaining hosts are then not attempted).
    """
    # Look at the environment at call time so values placed there after import
    # are honoured; fall back to the constants captured at import.
    ibiblio_bin = os.getenv('IBIBLIO_BIN') or IBIBLIO_BIN
    mirror_bin = os.getenv('MIRROR_BIN') or MIRROR_BIN

    # These preconditions do not depend on the host, so check them once.
    if not os.path.exists('gitpull.py'):
        print("gitpull.py script not found in the current directory.")
        return 1
    if not ibiblio_bin or not mirror_bin:
        print("IBIBLIO_BIN or MIRROR_BIN environment variable not set.")
        return 1

    for host in mirrors + [ibiblio]:
        logger.info(f"Updating gitpull.py script on {host}...")
        remote_dir = ibiblio_bin if host == ibiblio else mirror_bin
        try:
            # scp has to run on this machine: the source file is local.
            # Running it through ssh would look for gitpull.py on the remote.
            completed = subprocess.run(
                ["scp", "gitpull.py", f"{host}:{remote_dir}"],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=60,
            )
        except (subprocess.SubprocessError, OSError) as e:
            # Prefer scp's own stderr when there is any; it names the real cause
            detail = getattr(e, 'stderr', None) or str(e)
            logger.error(f"Failed to update gitpull script on {host}: {detail}")
            return 1

        print(f"Successfully updated gitpull script on {host}")
        print(f"Finished updating gitpull.py on {host}, result = {completed.stdout}\n")
    return 0

def main():
    """
    Main entry point for the script.

    Parses the eBook number, loads .env, runs gitpull.py on every mirror
    (without Git history, creating the target directory when needed) and
    finally touches a .zip.trig file on ibiblio to trigger its own pull.
    Exits with status 0 on success and 1 when the environment is not usable;
    an invalid eBook number ends with the argparse usage error. Errors from
    the SSH helpers propagate.
    """
    parser = argparse.ArgumentParser(
        description="Update an eBook directory on the mirrors with the latest files from the Git repository",
        epilog="Example: %(prog)s 12345"
    )
    parser.add_argument(
        "ebook_number",
        help="Number of the eBook Git repository to pull from"
    )
    # Parse first so that --help and usage errors work without a .env file
    args = parser.parse_args()

    # The number ends up on a remote command line and is split into a
    # directory path, so accept plain decimal digits only.
    if not (args.ebook_number.isascii() and args.ebook_number.isdigit()):
        parser.error(f"ebook_number must be a positive integer, got {args.ebook_number!r}")

    if not load_env_file():  # Load .env variables at the start
        print("Failed to load environment variables.")
        sys.exit(1)

    # The module-level constants were read at import time, before .env was
    # loaded, so look the values up again now and fall back to the constants.
    mirror_bin = os.getenv('MIRROR_BIN') or MIRROR_BIN
    private = os.getenv('PRIVATE') or ''
    dopull_log_dir = os.path.join(private, 'logs', 'dopull') if private else DOPULL_LOG_DIR
    if not mirror_bin:
        print("MIRROR_BIN environment variable not set.")
        sys.exit(1)

    # Get the destination path for the eBook number
    destination = get_ebook_path(args.ebook_number)
    print(f"{args.ebook_number} goes to {destination}\n")
    destination = "~/ftp/" + destination
    for host in mirrors:
        print("Copying to " + host + "...")
        # Call gitpull.py on the host, creating the target directory if it doesn't exist, no history.
        # Use the option's full name rather than relying on argparse prefix matching.
        sargs = ["--norepo", "--createdirs", args.ebook_number, destination]
        run_python_script_via_ssh(host, f"{mirror_bin}/gitpull.py", sargs)
        print("Success!\n")

    # ibiblio is a special case, it needs to trigger other actions after the pull,
    # so we just trigger the pull there and let it do the rest
    print(f"Trigger processing of #{args.ebook_number} on ibiblio...")
    # Join with a separator: the directory has no trailing slash
    trigger_file = os.path.join(dopull_log_dir, f"{args.ebook_number}.zip.trig")
    run_ssh_command(ibiblio, "touch", [trigger_file])
    print("Success!\n")
    sys.exit(0)


if __name__ == "__main__":
    main()