This class wraps PyGithub calls with caching, minimal traversal logic,
and robust parsing for YAML and Markdown with YAML front matter.
repo = g.get_repo("owner/name")
loader = GHContentLoader(repo)
site = loader.load_site()
projects = loader.load_projects()
All public methods return plain Python dicts/lists to keep this loader
decoupled from application‑specific models.
def __init__(self, repo, paths: Optional[GHContentPaths] = None, ref: Optional[str] = None, max_retries: int = 3):
"""Initialize the content loader.
repo: A PyGithub Repository instance.
paths: Optional path configuration. Defaults to GHContentPaths().
ref: Optional git reference (branch/tag/SHA). Defaults to paths.ref.
max_retries: Number of attempts for transient API errors.
self.paths = paths or GHContentPaths()
self.ref = ref or self.paths.ref
self.max_retries = max_retries
# Caches to minimize GitHub API calls within a single process lifetime.
self._cache_listdir: Dict[str, List[str]] = {}
self._cache_filetext: Dict[str, str] = {}
# ---------- public API ----------
def load_site(self) -> Dict[str, Any]:
    """Load and parse the site configuration file.

    Returns:
        A mapping of the site.yml content. If site.yml were mistakenly a
        Markdown file with YAML front matter, any remaining body would be
        placed under the "body" key for completeness.

    Raises:
        ValueError: If YAML is syntactically invalid or root is not a map.
        github.GithubException: On non-transient API errors.
    """
    path = f"{self.paths.root_dir}/{self.paths.site_file}"
    text = self._fetch_file_text(path)
    data, body = self._parse_front_matter_or_yaml_text(text, path)
    # Preserve body if present (unexpected for site.yml but harmless).
    # Guard on None so a plain YAML file does not gain a spurious
    # "body": None entry from an unconditional setdefault.
    if body is not None:
        data.setdefault("body", body)
    return data
def load_projects(self) -> List[Dict[str, Any]]:
    """Return every item in the "projects" collection as a list of dicts."""
    projects_dir = self.paths.projects_dir
    return self._load_collection(projects_dir)
def load_team(self) -> List[Dict[str, Any]]:
    """Return every item in the "team" collection as a list of dicts."""
    team_dir = self.paths.team_dir
    return self._load_collection(team_dir)
def load_collaborators(self) -> List[Dict[str, Any]]:
    """Return every item in the "collaborators" collection as a list of dicts."""
    collaborators_dir = self.paths.collaborators_dir
    return self._load_collection(collaborators_dir)
def load_experiments(self) -> List[Dict[str, Any]]:
    """Load the "experiments" collection as a list of dicts."""
    # Fix: docstring previously said "collaborators" (copy-paste error);
    # this method loads from experiments_dir.
    return self._load_collection(self.paths.experiments_dir)
# ---------- internals ----------
def _load_collection(self, subdir: str) -> List[Dict[str, Any]]:
"""Load a collection directory of YAML/MD/MDX items.
subdir: Name of the subdirectory under root_dir to load.
A list of item dictionaries. Each item will have:
- slug: Either provided in front matter/YAML or derived from filename.
- _path: The repository path to the source file (for debugging).
- body: For Markdown items, the content after front matter.
base = f"{self.paths.root_dir}/{subdir}"
files = self._list_files(base, exts=(".yml", ".yaml", ".md", ".mdx"))
items: List[Dict[str, Any]] = []
text = self._fetch_file_text(f)
data, body = self._parse_front_matter_or_yaml_text(text, f)
# Ensure a stable slug exists even if not set explicitly.
data.setdefault("slug", data.get("slug") or self._slug_from_filename(f))
# Keep source path for traceability and debugging.
# Preserve body for Markdown content if not overridden in front matter.
if body is not None and "body" not in data:
def _list_files(self, dir_path: str, exts: Tuple[str, ...]) -> List[str]:
"""List all files under a directory (recursively) with matching extensions.
Uses the GitHub Contents API which returns either a list (for
directories) or a single object (for files). This function normalizes
that into a flat list of file paths and recurses into subdirectories.
Results are memoized for the process lifetime to reduce API calls.
dir_path: The repository path to list.
exts: Tuple of lowercase filename extensions to include.
A sorted list of repository file paths.
if dir_path in self._cache_listdir:
return list(self._cache_listdir[dir_path])
contents = self._safe_get_contents(dir_path)
# Non‑existent folders or unexpected errors are treated as empty.
# Depth‑first traversal using an explicit stack; the API returns either
# a single object or a list, so normalize to a list for iteration.
stack = contents if isinstance(contents, list) else [contents]
if getattr(entry, "type", None) == "dir":
sub = self._safe_get_contents(entry.path)
# Skip directories that intermittently fail to list.
stack.extend(sub if isinstance(sub, list) else [sub])
elif getattr(entry, "type", None) == "file" and entry.path.lower().endswith(exts):
self._cache_listdir[dir_path] = list(files)
def _safe_get_contents(self, path: str):
"""Wrapper around repo.get_contents with simple exponential backoff.
Retries common transient error codes. On final failure, re‑raises the
last caught exception for clarity.
for _ in range(self.max_retries):
return self.repo.get_contents(path, ref=self.ref)
except github.GithubException as e:
# 403/429: rate limiting; 5xx: transient server errors.
if getattr(e, 'status', None) in (403, 429, 500, 502, 503):
# Other GithubException: re‑raise immediately.
# Network/transport or unexpected errors — also retry briefly.
# Surface the last failure to the caller after exhausting retries.
def _fetch_file_text(self, path: str) -> str:
"""Fetch file contents as text, with caching and robust decoding.
The GitHub API typically provides decoded_content as bytes; however,
fallbacks are present for edge cases where manual base64 decoding is
necessary. Results are cached by path for the process lifetime.
if path in self._cache_filetext:
return self._cache_filetext[path]
cf = self._safe_get_contents(path)
data = cf.decoded_content
if isinstance(data, bytes):
# utf-8-sig handles optional BOM transparently.
text = data.decode("utf-8-sig")
# Some PyGithub versions may already provide a str.
# Fallback to manual base64 decode if needed
if isinstance(getattr(cf, "content", None), str):
text = base64.b64decode(cf.content).decode("utf-8-sig")
# As a last resort, decode with replacement to avoid crashes.
text = base64.b64decode(cf.content).decode("utf-8", errors="replace")
text = cf.decoded_content.decode("utf-8", errors="replace")
self._cache_filetext[path] = text
def _parse_front_matter_or_yaml_text(self, text: str, path_hint: str) -> Tuple[Dict[str, Any], Optional[str]]:
"""Parse text as YAML (for .yml/.yaml) or MD with optional front matter.
text: Raw file text from the repository.
path_hint: File path used to decide YAML vs Markdown parsing.
A tuple of (data, body). For YAML files, body is None. For Markdown
without front matter, data is an empty dict and body is the text.
if path_hint.lower().endswith((".yml", ".yaml")):
return self._load_yaml_str(text), None
# Markdown with optional YAML front matter
fm_yaml, body = m.group(1), m.group(2)
return self._load_yaml_str(fm_yaml), body
return {}, text # no front matter; whole file is body
def _load_yaml_str(s: str) -> Dict[str, Any]:
    """Load a YAML string into a dict with helpful validation.

    Returns an empty dict for empty YAML. Raises a ValueError if the root
    is not a mapping object to prevent accidental list/str roots.

    Args:
        s: YAML source text.

    Raises:
        ValueError: On invalid YAML or a non-mapping root.
    """
    try:
        # safe_load only: never yaml.load on repository-provided content.
        data = yaml.safe_load(s)
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML: {e}") from e
    if data is None:
        # Empty documents parse to None; normalize to an empty mapping.
        return {}
    if not isinstance(data, dict):
        raise ValueError("YAML root must be a mapping/object.")
    return data
def _slug_from_filename(path: str) -> str:
"""Derive a URL‑safe slug from a repository file path.
Example: "content/projects/Sunrise Drive.yml" -> "sunrise-drive".
Falls back to the base filename (sans extension) if sanitization yields
# e.g., content/projects/sunrise drive.yml -> sunrise-drive
base = posixpath.basename(path)
name = base.rsplit(".", 1)[0]
slug = re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")