Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,174 changes: 1,292 additions & 882 deletions Include/internal/pycore_uop_ids.h

Large diffs are not rendered by default.

5,315 changes: 5,226 additions & 89 deletions Include/internal/pycore_uop_metadata.h

Large diffs are not rendered by default.

41,352 changes: 28,184 additions & 13,168 deletions Python/executor_cases.c.h

Large diffs are not rendered by default.

21 changes: 11 additions & 10 deletions Tools/cases_generator/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

from parser import Stmt, SimpleStmt, BlockStmt, IfStmt, WhileStmt, ForStmt, MacroIfStmt

MAX_CACHED_REGISTER = 3
MAX_GENERATED_CACHED_REGISTER = 5
MAX_CACHED_REGISTER = 3 # Platform-specific; controls compile-time case pruning

@dataclass
class EscapingCall:
Expand Down Expand Up @@ -1369,13 +1370,13 @@ def is_large(uop: Uop) -> bool:

def get_uop_cache_depths(uop: Uop) -> Iterator[tuple[int, int, int]]:
if uop.name == "_SPILL_OR_RELOAD":
for inputs in range(MAX_CACHED_REGISTER+1):
for outputs in range(MAX_CACHED_REGISTER+1):
for inputs in range(MAX_GENERATED_CACHED_REGISTER + 1):
for outputs in range(MAX_GENERATED_CACHED_REGISTER + 1):
if inputs != outputs:
yield inputs, outputs, inputs
return
if uop.name in ("_DEOPT", "_HANDLE_PENDING_AND_DEOPT", "_EXIT_TRACE", "_DYNAMIC_EXIT"):
for i in range(MAX_CACHED_REGISTER+1):
for i in range(MAX_GENERATED_CACHED_REGISTER + 1):
yield i, 0, 0
return
if uop.name in ("_START_EXECUTOR", "_JUMP_TO_TOP", "_COLD_EXIT"):
Expand All @@ -1397,20 +1398,20 @@ def get_uop_cache_depths(uop: Uop) -> Iterator[tuple[int, int, int]]:
has_array = True
break
ideal_outputs += 1
if ideal_inputs > MAX_CACHED_REGISTER:
ideal_inputs = MAX_CACHED_REGISTER
if ideal_outputs > MAX_CACHED_REGISTER:
ideal_outputs = MAX_CACHED_REGISTER
if ideal_inputs > MAX_GENERATED_CACHED_REGISTER:
ideal_inputs = MAX_GENERATED_CACHED_REGISTER
if ideal_outputs > MAX_GENERATED_CACHED_REGISTER:
ideal_outputs = MAX_GENERATED_CACHED_REGISTER
at_end = uop.properties.sync_sp or uop.properties.side_exit_at_end
exit_depth = ideal_outputs if at_end else ideal_inputs
if uop.properties.escapes or uop.properties.sync_sp or has_array or is_large(uop):
yield ideal_inputs, ideal_outputs, exit_depth
return
for inputs in range(MAX_CACHED_REGISTER + 1):
for inputs in range(MAX_GENERATED_CACHED_REGISTER + 1):
outputs = ideal_outputs - ideal_inputs + inputs
if outputs < ideal_outputs:
outputs = ideal_outputs
elif outputs > MAX_CACHED_REGISTER:
elif outputs > MAX_GENERATED_CACHED_REGISTER:
continue
yield inputs, outputs, outputs if at_end else inputs

Expand Down
16 changes: 13 additions & 3 deletions Tools/cases_generator/tier2_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
analysis_error,
get_uop_cache_depths,
is_large,
MAX_CACHED_REGISTER,
MAX_GENERATED_CACHED_REGISTER,
)

from generators_common import (
Expand Down Expand Up @@ -202,8 +202,10 @@ def cache_items(self, stack: Stack, cached_items: int, zero_regs: bool) -> None:
# replace this with a "clobber" to tell
# the compiler that these values are unused
# without having to emit any code.
for i in range(cached_items, MAX_CACHED_REGISTER):
for i in range(cached_items, MAX_GENERATED_CACHED_REGISTER):
self.out.emit(f"#if MAX_CACHED_REGISTER >= {i + 1}\n")
self.out.emit(f"_tos_cache{i} = PyStackRef_ZERO_BITS;\n")
self.out.emit("#endif\n")
self.emit(f"SET_CURRENT_CACHED_VALUES({cached_items});\n")


Expand Down Expand Up @@ -277,7 +279,12 @@ def generate_tier2(
continue
for inputs, outputs, exit_depth in get_uop_cache_depths(uop):
emitter = Tier2Emitter(out, analysis.labels, exit_depth)
out.emit(f"case {uop.name}_r{inputs}{outputs}: {{\n")
opname = f"{uop.name}_r{inputs}{outputs}"
needed_cached_registers = max(inputs, outputs)
if needed_cached_registers:
out.start_line()
out.out.write(f"#if MAX_CACHED_REGISTER >= {needed_cached_registers}\n")
out.emit(f"case {opname}: {{\n")
out.emit(f"CHECK_CURRENT_CACHED_VALUES({inputs});\n")
out.emit("assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE());\n")
declare_variables(uop, out)
Expand All @@ -292,6 +299,9 @@ def generate_tier2(
out.emit("break;\n")
out.start_line()
out.emit("}")
if needed_cached_registers:
out.start_line()
out.out.write(f"#endif\n")
out.emit("\n\n")
out.emit("\n")
outfile.write("#undef TIER_TWO\n")
Expand Down
17 changes: 16 additions & 1 deletion Tools/cases_generator/uop_id_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
"""

import argparse
from collections import defaultdict

from analyzer import (
Analysis,
analyze_files,
get_uop_cache_depths,
MAX_CACHED_REGISTER,
)
from generators_common import (
DEFAULT_INPUT,
Expand Down Expand Up @@ -48,15 +50,28 @@ def generate_uop_ids(
next_id += 1

out.emit(f"#define MAX_UOP_ID {next_id-1}\n")
out.emit(f"#define MAX_CACHED_REGISTER {MAX_CACHED_REGISTER}\n")
register_groups: dict[int, list[tuple[str, int, int]]] = defaultdict(list)
for name, uop in sorted(uops):
if uop.properties.tier == 1:
continue
if uop.properties.records_value:
continue
for inputs, outputs, _ in sorted(get_uop_cache_depths(uop)):
register_groups[max(inputs, outputs)].append((name, inputs, outputs))
first_group = True
for level in sorted(register_groups):
if level > 0:
out.emit(f"#if MAX_CACHED_REGISTER >= {level}\n")
for name, inputs, outputs in register_groups[level]:
out.emit(f"#define {name}_r{inputs}{outputs} {next_id}\n")
next_id += 1
out.emit(f"#define MAX_UOP_REGS_ID {next_id-1}\n")
if not first_group:
out.emit(f"#undef MAX_UOP_REGS_ID\n")
out.emit(f"#define MAX_UOP_REGS_ID {next_id-1}\n")
first_group = False
if level > 0:
out.emit(f"#endif\n")


arg_parser = argparse.ArgumentParser(
Expand Down
13 changes: 12 additions & 1 deletion Tools/cases_generator/uop_metadata_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ def uop_cache_info(uop: Uop) -> list[str] | None:
high = -1
defined = [ False ] * 4
for inputs, outputs, exit_depth in get_uop_cache_depths(uop):
if max(inputs, outputs) > MAX_CACHED_REGISTER:
continue
entries[inputs] = f"{{ {outputs}, {exit_depth}, {uop.name}_r{inputs}{outputs} }},\n"
if inputs < low:
low = inputs
Expand All @@ -63,7 +65,6 @@ def uop_cache_info(uop: Uop) -> list[str] | None:


def generate_names_and_flags(analysis: Analysis, out: CWriter) -> None:
out.emit(f"#define MAX_CACHED_REGISTER {MAX_CACHED_REGISTER}\n")
out.emit("extern const uint32_t _PyUop_Flags[MAX_UOP_ID+1];\n")
out.emit("typedef struct _rep_range { uint8_t start; uint8_t stop; } ReplicationRange;\n")
out.emit("extern const ReplicationRange _PyUop_Replication[MAX_UOP_ID+1];\n")
Expand Down Expand Up @@ -100,7 +101,12 @@ def generate_names_and_flags(analysis: Analysis, out: CWriter) -> None:
for uop in analysis.uops.values():
if uop.is_viable() and uop.properties.tier != 1 and not uop.is_super() and not uop.properties.records_value:
for inputs, outputs, _ in get_uop_cache_depths(uop):
needed = max(inputs, outputs)
if needed > 0:
out.emit(f"#if MAX_CACHED_REGISTER >= {needed}\n")
out.emit(f"[{uop.name}_r{inputs}{outputs}] = {uop.name},\n")
if needed > 0:
out.emit(f"#endif\n")
out.emit("};\n\n")
out.emit(f"const uint16_t _PyUop_SpillsAndReloads[{MAX_CACHED_REGISTER+1}][{MAX_CACHED_REGISTER+1}] = {{\n")
for i in range(MAX_CACHED_REGISTER+1):
Expand All @@ -114,7 +120,12 @@ def generate_names_and_flags(analysis: Analysis, out: CWriter) -> None:
out.emit(f'[{uop.name}] = "{uop.name}",\n')
if not uop.properties.records_value:
for inputs, outputs, _ in get_uop_cache_depths(uop):
needed = max(inputs, outputs)
if needed > 0:
out.emit(f"#if MAX_CACHED_REGISTER >= {needed}\n")
out.emit(f'[{uop.name}_r{inputs}{outputs}] = "{uop.name}_r{inputs}{outputs}",\n')
if needed > 0:
out.emit(f"#endif\n")
out.emit("};\n")
out.emit("int _PyUop_num_popped(int opcode, int oparg)\n{\n")
out.emit("switch(opcode) {\n")
Expand Down
9 changes: 8 additions & 1 deletion Tools/jit/_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@

ASYNCIO_RUNNER = asyncio.Runner()

# TODO: Read MAX_CACHED_REGISTER from Tools/cases_generator/analyzer.py instead of hardcoding it; the two values must be kept in sync by hand until then.
_MAX_CACHED_REGISTER = 3

_S = typing.TypeVar("_S", _schema.COFFSection, _schema.ELFSection, _schema.MachOSection)
_R = typing.TypeVar(
"_R", _schema.COFFRelocation, _schema.ELFRelocation, _schema.MachORelocation
Expand Down Expand Up @@ -197,7 +200,8 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]:
generated_cases = PYTHON_EXECUTOR_CASES_C_H.read_text()
cases_and_opnames = sorted(
re.findall(
r"\n {8}(case (\w+): \{\n.*?\n {8}\})", generated_cases, flags=re.DOTALL
r"\n((?:#if [^\n]*\n)? {8}case (\w+): \{\n.*?\n {8}\}(?:\n#endif)?)",
generated_cases, flags=re.DOTALL,
)
)
tasks = []
Expand All @@ -208,6 +212,9 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]:
tasks.append(group.create_task(coro, name="shim"))
template = TOOLS_JIT_TEMPLATE_C.read_text()
for case, opname in cases_and_opnames:
guard = re.match(r"#if MAX_CACHED_REGISTER >= (\d+)\n", case)
if guard and int(guard.group(1)) > _MAX_CACHED_REGISTER:
continue
# Write out a copy of the template with *only* this case
# inserted. This is about twice as fast as #include'ing all
# of executor_cases.c.h each time we compile (since the C
Expand Down