Skip to content

Commit cbebbb7

Browse files
committed
feat: improve storage for replays files
1 parent 648bbbf commit cbebbb7

7 files changed

Lines changed: 135 additions & 49 deletions

File tree

backend/game/game-architecture.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,7 @@ ronin/
384384
├── Makefile
385385
└── backend/
386386
├── shared/
387+
│ ├── storage.py # ReplayStorage protocol, LocalReplayStorage (gzip file persistence with two-level shard directories), replay_file_path helper
387388
│ ├── dal/
388389
│ │ ├── __init__.py # Public API: PlayerRepository, GameRepository, PlayedGame
389390
│ │ ├── models.py # PlayedGame persistence model

backend/game/replay/external/tenhou.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,7 @@ def main() -> None:
587587
old_file.unlink()
588588
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
589589

590-
replay_files = sorted(REPLAYS_DIR.glob("*.txt.gz"))
590+
replay_files = sorted(REPLAYS_DIR.glob("??/??/*.txt.gz"))
591591
if not replay_files:
592592
logger.info("No replay files found in %s", REPLAYS_DIR)
593593
return

backend/lobby/lobby-architecture.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ Portal service for room management, authentication, and game client serving.
1818

1919
### Public (replay access)
2020
- `GET /play/history/{game_id}` - Game client HTML page for replay viewing (serves same play.html template, no auth required)
21-
- `GET /api/replays/{game_id}` - Replay API endpoint returning gzip-compressed NDJSON replay content (`Content-Encoding: gzip`, `Cache-Control: immutable`). Returns 404 for invalid/nonexistent game IDs. Rate-limited via Traefik in production
21+
- `GET /api/replays/{game_id}` - Replay API endpoint returning gzip-compressed NDJSON replay content (`Content-Encoding: gzip`, `Cache-Control: immutable`). Requires game_id to be at least 4 characters; returns 404 for invalid/nonexistent game IDs. Rate-limited via Traefik in production
2222

2323
### Protected (session cookie or API key required)
2424
- `GET /` - Lobby HTML page (server-rendered, lists rooms from local room manager)
@@ -174,7 +174,7 @@ These practices are mandatory for all lobby code. Violations must be caught in c
174174
- **Views** (`views/`) - Jinja2 templates and view handlers split by domain:
175175
- `handlers.py` — Lobby, room, and matchmaking page handlers (`lobby_page`, `room_page`, `matchmaking_page`, `create_room_and_redirect`, `join_room_and_redirect`)
176176
- `history_handlers.py` — History page handler and game data transformation (`history_page`, `_format_duration`, `_prepare_history_for_display`)
177-
- `replay_handlers.py` — Replay API handler (`replay_content`); reads gzip-compressed replay files with path traversal protection and file size limits
177+
- `replay_handlers.py` — Replay API handler (`replay_content`); resolves sharded replay file paths via `shared.storage.replay_file_path`, enforces minimum game ID length, path traversal protection, and file size limits
178178
- `game_handlers.py` — Game client and dev page handlers (`play_page`, `styleguide_page`)
179179
- `assets.py` — Vite manifest utilities and Jinja2 template factory (`create_templates`, `load_vite_manifest`, `resolve_vite_asset_urls`)
180180
- `auth_handlers.py` — Auth handlers (login, register, logout, bot_auth, bot_create_room, bot_matchmaking_auth)
@@ -190,6 +190,7 @@ Dependencies on `shared/`:
190190
- `shared.validators` - String list parsing (CORS origins, allowed hosts) and custom env settings source
191191
- `shared.auth` - `AuthService`, `AuthSessionStore`, `PlayerRepository` for player management; `create_signed_ticket` and `sign_game_ticket` for HMAC-signed game tickets
192192
- `shared.db` - `Database`, `SqlitePlayerRepository` for SQLite-backed player storage, `SqliteGameRepository` for played game queries
193+
- `shared.storage` - `replay_file_path` for resolving sharded replay file paths, `_MIN_GAME_ID_LEN` for game ID validation
193194

194195
## Project Structure
195196

@@ -306,7 +307,7 @@ Lobby settings (prefixed with `LOBBY_`):
306307
- `LOBBY_GAME_CLIENT_URL` - Game client URL for room creation redirects and join links (default: `/play`)
307308
- `LOBBY_WS_ALLOWED_ORIGIN` - Allowed origin for WebSocket connections (CSRF protection). Default: `http://localhost:8710`. Set to `None` to allow all origins
308309
- `LOBBY_GAME_ASSETS_DIR` - Directory containing built game client assets and `.vite/manifest.json` (default: `frontend/dist`)
309-
- `LOBBY_REPLAY_DIR` - Directory containing gzip-compressed replay files (default: `backend/data/replays`)
310+
- `LOBBY_REPLAY_DIR` - Root directory for gzip-compressed replay files distributed across a two-level shard structure `{id[0:2]}/{id[2:4]}/{game_id}.txt.gz` (default: `backend/data/replays`)
310311
- `LOBBY_VITE_DEV_URL` - Vite dev server URL for HMR in development (default: empty; set to `http://localhost:5173` when running Vite dev server)
311312
312313
Auth settings (prefixed with `AUTH_`):

backend/lobby/tests/unit/test_replay_handlers.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from lobby.server.settings import LobbyServerSettings
1010
from lobby.views.replay_handlers import _load_replay
1111
from shared.auth.settings import AuthSettings
12+
from shared.storage import replay_file_path
1213

1314

1415
def _make_client(tmp_path):
@@ -37,9 +38,11 @@ def _make_client(tmp_path):
3738

3839

3940
def _write_gzip_replay(replay_dir, game_id, content):
40-
"""Write a gzip-compressed replay file."""
41+
"""Write a gzip-compressed replay file at the sharded path."""
42+
target = replay_file_path(replay_dir, game_id)
43+
target.parent.mkdir(parents=True, exist_ok=True)
4144
compressed = gzip.compress(content.encode("utf-8"))
42-
(replay_dir / f"{game_id}.txt.gz").write_bytes(compressed)
45+
target.write_bytes(compressed)
4346

4447

4548
class TestReplayContent:
@@ -107,6 +110,11 @@ def test_game_id_too_long_returns_404(self, setup):
107110
response = client.get(f"/api/replays/{'a' * 51}")
108111
assert response.status_code == 404
109112

113+
def test_game_id_too_short_returns_404(self, setup):
114+
client, _replay_dir = setup
115+
response = client.get("/api/replays/abc")
116+
assert response.status_code == 404
117+
110118
def test_game_id_at_max_length_is_accepted(self, setup):
111119
"""A 50-char game_id should be accepted (if file exists)."""
112120
client, replay_dir = setup
@@ -126,33 +134,40 @@ def test_game_id_with_url_encoded_traversal_returns_404(self, setup):
126134
def test_oversized_file_returns_404(self, setup):
127135
"""Files exceeding 1 MB on disk are rejected."""
128136
client, replay_dir = setup
137+
game_id = "big-game"
138+
target = replay_file_path(replay_dir, game_id)
139+
target.parent.mkdir(parents=True, exist_ok=True)
129140
# Write raw bytes > 1 MB (not valid gzip, but size check happens first)
130-
(replay_dir / "big-game.txt.gz").write_bytes(b"\x00" * (1_048_576 + 1))
141+
target.write_bytes(b"\x00" * (1_048_576 + 1))
131142

132-
response = client.get("/api/replays/big-game")
143+
response = client.get(f"/api/replays/{game_id}")
133144
assert response.status_code == 404
134145

135146
def test_unreadable_file_returns_404(self, setup):
136147
"""Files that exist but can't be read return 404."""
137148
client, replay_dir = setup
138-
target = replay_dir / "broken.txt.gz"
139-
target.mkdir() # directory, not a file — read_bytes will fail
149+
game_id = "broken_file"
150+
target = replay_file_path(replay_dir, game_id)
151+
target.parent.mkdir(parents=True, exist_ok=True)
152+
target.mkdir() # directory, not a file — read will fail
140153

141-
response = client.get("/api/replays/broken")
154+
response = client.get(f"/api/replays/{game_id}")
142155
assert response.status_code == 404
143156

144157
def test_path_traversal_via_symlink_returns_404(self, setup, tmp_path):
145158
"""A game_id whose resolved path escapes the replay dir is rejected."""
146159
_client, replay_dir = setup
160+
game_id = "escape_link"
147161
# Create a file outside the replay dir
148162
outside = tmp_path / "secret.txt.gz"
149163
outside.write_bytes(gzip.compress(b"secret"))
150-
# Create a symlink inside the replay dir pointing outside
151-
link = replay_dir / "escape.txt.gz"
152-
link.symlink_to(outside)
164+
# Create a symlink at the sharded path pointing outside
165+
target = replay_file_path(replay_dir, game_id)
166+
target.parent.mkdir(parents=True, exist_ok=True)
167+
target.symlink_to(outside)
153168

154169
# Call the sync helper directly — the symlink resolves outside replay_dir
155-
result = _load_replay(str(replay_dir), "escape")
170+
result = _load_replay(str(replay_dir), game_id)
156171
assert result is None
157172

158173
def test_replay_page_route_serves_play_page(self, setup):

backend/lobby/views/replay_handlers.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
from starlette.responses import Response
1111

12+
from shared.storage import _MIN_GAME_ID_LEN, replay_file_path
13+
1214
if TYPE_CHECKING:
1315
from starlette.requests import Request
1416

@@ -25,11 +27,11 @@
2527
def _load_replay(replay_dir_str: str, game_id: str) -> bytes | None:
2628
"""Resolve paths and read a gzip-compressed replay file.
2729
28-
Returns the raw gzip bytes, or None if the file is missing, too large,
30+
Return the raw gzip bytes, or None if the file is missing, too large,
2931
or the game_id resolves outside the replay directory.
3032
"""
3133
replay_dir = Path(replay_dir_str).resolve()
32-
target = (replay_dir / f"{game_id}.txt.gz").resolve()
34+
target = replay_file_path(replay_dir, game_id).resolve()
3335

3436
if not target.is_relative_to(replay_dir):
3537
return None
@@ -50,7 +52,12 @@ async def replay_content(request: Request) -> Response:
5052
"""GET /api/replays/{game_id} — serve a gzip-compressed replay file."""
5153
game_id = request.path_params["game_id"]
5254

53-
if not game_id or len(game_id) > _GAME_ID_MAX_LEN or not _GAME_ID_RE.match(game_id):
55+
if (
56+
not game_id
57+
or len(game_id) < _MIN_GAME_ID_LEN
58+
or len(game_id) > _GAME_ID_MAX_LEN
59+
or not _GAME_ID_RE.match(game_id)
60+
):
5461
return _NOT_FOUND
5562

5663
replay_dir_str: str = request.app.state.settings.replay_dir

backend/shared/storage.py

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,12 @@
22
33
Replay files are gzip-compressed NDJSON containing full game event logs.
44
Replays are public and served to any user via the lobby replay API.
5-
Files are written with owner-only permissions (0o600) inside an
6-
owner-only directory (0o700) as a filesystem hygiene measure.
5+
Files are written with owner-only permissions (0o600) inside owner-only
6+
directories (0o700) as a filesystem hygiene measure.
7+
8+
Replay files are distributed across a two-level directory structure using
9+
the first 4 characters of the game ID as shard prefixes:
10+
``{replay_dir}/{id[0:2]}/{id[2:4]}/{game_id}.txt.gz``
711
"""
812

913
import contextlib
@@ -23,6 +27,23 @@
2327
# Owner-only file permissions for replay data files.
2428
_REPLAY_FILE_MODE = 0o600
2529

30+
# Game IDs shorter than this cannot produce a two-level shard prefix.
31+
_MIN_GAME_ID_LEN = 4
32+
33+
34+
def replay_file_path(replay_dir: Path, game_id: str) -> Path:
35+
"""Build the sharded file path for a replay.
36+
37+
Use the first 4 characters of game_id as a two-level directory
38+
prefix (2 chars each) to distribute files across subdirectories.
39+
Require game_id to be at least 4 characters long.
40+
"""
41+
if len(game_id) < _MIN_GAME_ID_LEN:
42+
raise ValueError(f"game_id must be at least {_MIN_GAME_ID_LEN} characters, got {len(game_id)}")
43+
prefix_a = game_id[:2]
44+
prefix_b = game_id[2:4]
45+
return replay_dir / prefix_a / prefix_b / f"{game_id}.txt.gz"
46+
2647

2748
class ReplayStorage(Protocol):
2849
"""Protocol for persisting replay data."""
@@ -31,10 +52,11 @@ def save_replay(self, game_id: str, content: str) -> None: ...
3152

3253

3354
class LocalReplayStorage:
34-
"""Writes gzip-compressed replay files to the local filesystem.
55+
"""Write gzip-compressed replay files to the local filesystem.
3556
36-
Files are created with owner-only read/write (0o600) inside an
37-
owner-only directory (0o700) as a filesystem hygiene measure.
57+
Files are created with owner-only read/write (0o600) inside owner-only
58+
directories (0o700) as a filesystem hygiene measure. Replay files are
59+
distributed across a two-level shard directory structure.
3860
"""
3961

4062
def __init__(self, replay_dir: str) -> None:
@@ -43,21 +65,26 @@ def __init__(self, replay_dir: str) -> None:
4365
def save_replay(self, game_id: str, content: str) -> None:
4466
"""Save gzip-compressed replay content under the configured directory.
4567
46-
Creates the directory lazily on first write with owner-only permissions
47-
(0o700). Writes replay files atomically via temp-file-then-rename with
48-
owner-only permissions (0o600). Rejects path traversal attempts that
49-
would place the file outside the replay root.
68+
Create shard directories lazily on first write with owner-only
69+
permissions (0o700). Write replay files atomically via
70+
temp-file-then-rename with owner-only permissions (0o600). Reject
71+
path traversal attempts that would place the file outside the replay
72+
root.
5073
"""
51-
target = (self._replay_dir / f"{game_id}.txt.gz").resolve()
74+
target = replay_file_path(self._replay_dir, game_id).resolve()
5275
if not target.is_relative_to(self._replay_dir):
5376
raise ValueError(f"Path traversal rejected: '{game_id}' resolves outside replay directory")
5477

55-
self._replay_dir.mkdir(mode=_REPLAY_DIR_MODE, parents=True, exist_ok=True)
56-
self._replay_dir.chmod(_REPLAY_DIR_MODE)
78+
shard_dir = target.parent
79+
shard_dir.mkdir(mode=_REPLAY_DIR_MODE, parents=True, exist_ok=True)
80+
# Ensure owner-only permissions on all directory levels (mkdir
81+
# parents=True applies mode only to the leaf; umask may weaken it).
82+
for directory in (self._replay_dir, shard_dir.parent, shard_dir):
83+
directory.chmod(_REPLAY_DIR_MODE)
5784

5885
compressed = gzip.compress(content.encode("utf-8"))
5986

60-
fd, tmp_path = tempfile.mkstemp(dir=str(self._replay_dir), suffix=".tmp", prefix=".replay_")
87+
fd, tmp_path = tempfile.mkstemp(dir=str(shard_dir), suffix=".tmp", prefix=".replay_")
6188
fd_owned = True
6289
try:
6390
with os.fdopen(fd, "wb") as f:

0 commit comments

Comments
 (0)