/claim #634
Closes #634
This PR addresses a resource leak in GitPolicyFetcher where repository objects were remaining in the internal repos cache even after a fetch operation failed (e.g., due to network issues or HTTP 500 errors from the git provider). This behavior could lead to “zombie” processes or lingering file descriptors/symbolic links as described in the issue.
Specific changes:
packages/opal-server/opal_server/git_fetcher.py:
fetch operation within fetch_and_notify_on_changes in a try...except block.GitPolicyFetcher.repos dictionary if an exception occurs during fetching.https://github.com/user-attachments/assets/49f52571-408d-4668-9ab3-eb8ec923491d
I verified this fix using a reproduction script that simulates a remote git failure (HTTP 500). Without the fix, the repository object remained in the cache. With the fix, the repository is correctly removed from the cache.
import asyncio
import os
import shutil
from pathlib import Path
import logging
import sys
import git
# Add packages to path
sys.path.append(str(Path(__file__).parent / "packages/opal-server"))
sys.path.append(str(Path(__file__).parent / "packages/opal-common"))
from opal_server.git_fetcher import GitPolicyFetcher, PolicyFetcherCallbacks
from opal_common.schemas.policy_source import GitPolicyScopeSource, NoAuthData
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MockCallbacks(PolicyFetcherCallbacks):
async def on_update(self, old_head, head):
pass
async def run_test():
base_dir = Path("./temp_test_opal")
if base_dir.exists():
shutil.rmtree(base_dir)
base_dir.mkdir()
# Create a dummy local repo to serve as remote
remote_repo_dir = Path("./temp_remote_repo")
if remote_repo_dir.exists():
shutil.rmtree(remote_repo_dir)
remote_repo_dir.mkdir()
# Initialize repo
r = git.Repo.init(remote_repo_dir)
# Create a commit
(remote_repo_dir / "policy.rego").write_text("package system")
r.index.add(["policy.rego"])
r.index.commit("Initial commit")
# Use the local repo as source
source = GitPolicyScopeSource(
source_type="GIT",
url=str(remote_repo_dir.absolute()),
branch="master",
directories=["."],
auth=NoAuthData()
)
fetcher = GitPolicyFetcher(
base_dir=base_dir,
scope_id="test_scope",
source=source,
callbacks=MockCallbacks()
)
# First fetch should succeed
logger.info("Performing initial fetch...")
await fetcher.fetch_and_notify_on_changes(force_fetch=True)
logger.info("Initial fetch done.")
# Now change the remote URL in the local repo config to a bad HTTP URL
# We use GitPython to modify the config
repo_path = fetcher._repo_path
repo = git.Repo(str(repo_path))
bad_url = "https://httpstat.us/500/repo.git"
repo.remotes.origin.set_url(bad_url)
# Update fetcher source URL to match, so verification passes
fetcher._source.url = bad_url
# Initial FD count
initial_fds = len(os.listdir(f"/proc/{os.getpid()}/fd"))
logger.info(f"Initial FDs: {initial_fds}")
repo_path_str = str(fetcher._repo_path)
for i in range(20):
# Check cache status
# Note: We can't easily access GitPolicyFetcher.repos from here if it's not exposed,
# but we imported the class so we can access the static attribute.
try:
await fetcher.fetch_and_notify_on_changes(force_fetch=True)
except Exception as e:
# logger.info(f"Fetch failed as expected: {e}")
pass
if repo_path_str in GitPolicyFetcher.repos:
logger.warning("Repo still in cache!")
else:
logger.info("Repo removed from cache.")
current_fds = len(os.listdir(f"/proc/{os.getpid()}/fd"))
logger.info(f"Iteration {i+1}: FDs: {current_fds}")
final_fds = len(os.listdir(f"/proc/{os.getpid()}/fd"))
logger.info(f"Final FDs: {final_fds}")
if final_fds > initial_fds + 2: # Allow small fluctuation
logger.error(f"FD leak detected! {final_fds - initial_fds} leaked FDs.")
exit(1)
else:
logger.info("No FD leak detected.")
if __name__ == "__main__":
asyncio.run(run_test())
Felipe Avelar
@FelipeFMA
Permit.io
@Permitio