From cf2d4a2b2631ba7fec1b29976aa5024966b5a323 Mon Sep 17 00:00:00 2001 From: Robert Seedorff Date: Sat, 13 Mar 2021 23:40:26 +0100 Subject: [PATCH 1/9] Added a new configuration option to obey the GitHub and GitLab ratelimits. --- scanners/git-repo-scanner/README.md.gotmpl | 24 ++++++----- .../scanner/git_repo_scanner.py | 42 ++++++++++++++----- .../scanner/git_repo_scanner_test.py | 1 + .../git-repo-scanner/scanner/requirements.txt | 4 +- 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/scanners/git-repo-scanner/README.md.gotmpl b/scanners/git-repo-scanner/README.md.gotmpl index b573606f14..697ce17740 100644 --- a/scanners/git-repo-scanner/README.md.gotmpl +++ b/scanners/git-repo-scanner/README.md.gotmpl @@ -32,22 +32,24 @@ or ``` #### GitHub -For type github you can use the following options: -- `--organization`: The name of the github organization you want to scan. -- `--url`: The url of the api for a github enterprise server. Skip this option for repos on . -- `--access-token`: Your personal github access token. -- `--ignore-repos`: A list of github repository ids you want to ignore +For type GitHub you can use the following options: +- `--organization`: The name of the GitHub organization you want to scan. +- `--url`: The url of the api for a GitHub enterprise server. Skip this option for repos on . +- `--access-token`: Your personal GitHub access token. +- `--ignore-repos`: A list of GitHub repository ids you want to ignore +- `--obey-rate-limit`: True to obey the rate limit of the GitHub server (default), otherwise False For now only organizations are supported so the option is mandatory. We **strongly recommend** providing an access token for authentication. If not provided the rate limiting will kick in after about 30 repositories scanned. #### GitLab -For type gitlab you can use the following options: -- `--url`: The url of the gitlab server. -- `--access-token`: Your personal gitlab access token.
-- `--group`: A specific gitlab group id you want to san, including subgroups. -- `--ignore-groups`: A list of gitlab group ids you want to ignore -- `--ignore-repos`: A list of gitlab project ids you want to ignore +For type GitLab you can use the following options: +- `--url`: The url of the GitLab server. +- `--access-token`: Your personal GitLab access token. +- `--group`: A specific GitLab group id you want to san, including subgroups. +- `--ignore-groups`: A list of GitLab group ids you want to ignore +- `--ignore-repos`: A list of GitLab project ids you want to ignore +- `--obey-rate-limit`: True to obey the rate limit of the GitLab server (default), otherwise False For gitlab the url and the access token is mandatory. If you don't provide a specific group id all projects on the gitlab server are going to be discovered. diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner.py b/scanners/git-repo-scanner/scanner/git_repo_scanner.py index 7f111bfdf6..aa3f7c67ff 100644 --- a/scanners/git-repo-scanner/scanner/git_repo_scanner.py +++ b/scanners/git-repo-scanner/scanner/git_repo_scanner.py @@ -2,6 +2,8 @@ import logging import sys import json +import calendar +import time from typing import List from pathlib import Path @@ -59,22 +61,22 @@ def write_findings_to_file(args, findings): def get_parser_args(args=None): parser = argparse.ArgumentParser(description='Scan public or private git repositories of organizations or groups') parser.add_argument('--git-type', - help='Repository type can be github or gitlab', + help='Repository type can be github or GitLab', choices=['github', 'gitlab'], required=True) parser.add_argument('--file-output', help='The path of the output file', required=True), - parser.add_argument('--url', help='The gitlab url or a github enterprise api url.', + parser.add_argument('--url', help='The GitLab url or a GitHub enterprise api url.', required=False) parser.add_argument('--access-token', help='An access token for authentication', 
required=False) parser.add_argument('--organization', - help='The name of the githup organization to scan', + help='The name of the GitHub organization to scan', required=False) parser.add_argument('--group', - help='The id of the gitlab group to scan', + help='The id of the GitLab group to scan', required=False) parser.add_argument('--ignore-repos', help='A list of repo ids to ignore', @@ -84,12 +86,18 @@ def get_parser_args(args=None): default=[], required=False) parser.add_argument('--ignore-groups', - help='A list of gitlab group ids to ignore', + help='A list of GitLab group ids to ignore', action='extend', nargs='+', type=int, default=[], required=False) + parser.add_argument('--obey-rate-limit', + help='True to obey the rate limit of the GitLab or GitHub server (default), otherwise False', + type=bool, + default=True, + required=False) + if args: return parser.parse_args(args) else: @@ -99,7 +107,7 @@ def get_parser_args(args=None): def parse_gitlab(args): gl: gitlab.Gitlab if not args.url: - logger.info(' URL required for gitlab connection.') + logger.info(' URL required for GitLab connection.') sys.exit(-1) logger.info(' Gitlab authentication...') @@ -128,12 +136,12 @@ def process_gitlab_projects(args, projects): def get_gitlab_projects(args, gl): if args.group: try: - projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True) + projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, obey_rate_limit=args.obey_rate_limit) except gitlab.exceptions.GitlabGetError: logger.info(' Group does not exist.') sys.exit(-1) else: - projects = gl.projects.list(all=True, max_retries=12) + projects = gl.projects.list(all=True, max_retries=12, obey_rate_limit=args.obey_rate_limit) return projects @@ -146,7 +154,7 @@ def gitlab_authenticate(args): except gitlab.exceptions.GitlabAuthenticationError: gl = gitlab_authenticate_oauth(args) else: - logger.info(' Access token required for gitlab authentication.') + 
logger.info(' Access token required for GitLab authentication.') sys.exit(-1) logger.info(' Success') return gl @@ -174,22 +182,34 @@ def parse_github(args): logger.info(' No organization provided') sys.exit(-1) +def respect_github_ratelimit(args, gh): + if args.obey_rate_limit: + api_limit = gh.get_rate_limit().core + reset_timestamp = calendar.timegm(api_limit.reset.timetuple()) + seconds_until_reset = reset_timestamp - calendar.timegm(time.gmtime()) + 5 # add 5 seconds to be sure the rate limit has been reset + sleep_time = seconds_until_reset / api_limit.remaining + + logger.info(' Checking Rate-Limit ('+ str(args.obey_rate_limit) +') [remainingApiCalls: ' + str(api_limit.remaining) + ', seconds_until_reset: ' + str(seconds_until_reset) + ', sleepTime: ' + str(sleep_time) + ']') + time.sleep(sleep_time) def process_github_repos(args, gh): findings = [] org: Organization = gh.get_organization(args.organization) repos: PaginatedList[Repository] = org.get_repos(type='all') for i in range(repos.totalCount): - process_github_repos_page(args, findings, repos.get_page(i)) + process_github_repos_page(args, findings, repos.get_page(i), gh) return findings -def process_github_repos_page(args, findings, repos): +def process_github_repos_page(args, findings, repos, gh): repo: Repository for repo in repos: if repo.id not in args.ignore_repos: logger.info(f' {len(findings) + 1} - {repo.name}') + findings.append(create_finding_github(repo)) + respect_github_ratelimit(args, gh) + def setup_github(args): diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py b/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py index 5724657a06..71f2247039 100644 --- a/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py +++ b/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py @@ -94,6 +94,7 @@ def test_parse_github_with_no_org_should_exit(self): def get_args(ignore_groups=0, ignore_projects=0, url=None, access_token=None, org=None): args = ['--git-type', 
'gitlab', '--file-output', 'out', + '--obey-rate-limit', False, '--ignore-repos', str(ignore_projects), '--ignore-groups', str(ignore_groups)] if url: diff --git a/scanners/git-repo-scanner/scanner/requirements.txt b/scanners/git-repo-scanner/scanner/requirements.txt index 3140bb4c74..95198b0130 100644 --- a/scanners/git-repo-scanner/scanner/requirements.txt +++ b/scanners/git-repo-scanner/scanner/requirements.txt @@ -1,4 +1,4 @@ -PyGithub == 1.53 -python-gitlab == 2.5.0 +PyGithub == 1.54.1 +python-gitlab == 2.6.0 munch == 2.5.0 mock == 4.0.2 From 8cf7bb1ccd152f6723a31a72ea5e2a46696d22a6 Mon Sep 17 00:00:00 2001 From: rseedorff Date: Sat, 13 Mar 2021 22:41:20 +0000 Subject: [PATCH 2/9] Updating Helm Docs --- scanners/git-repo-scanner/README.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/scanners/git-repo-scanner/README.md b/scanners/git-repo-scanner/README.md index 89702467e1..ebcd88b534 100644 --- a/scanners/git-repo-scanner/README.md +++ b/scanners/git-repo-scanner/README.md @@ -31,22 +31,24 @@ or ``` #### GitHub -For type github you can use the following options: -- `--organization`: The name of the github organization you want to scan. -- `--url`: The url of the api for a github enterprise server. Skip this option for repos on . -- `--access-token`: Your personal github access token. -- `--ignore-repos`: A list of github repository ids you want to ignore +For type GitHub you can use the following options: +- `--organization`: The name of the GitHub organization you want to scan. +- `--url`: The url of the api for a GitHub enterprise server. Skip this option for repos on . +- `--access-token`: Your personal GitHub access token. +- `--ignore-repos`: A list of GitHub repository ids you want to ignore +- `--obey-rate-limit`: True to obey the rate limit of the GitHub server (default), otherwise False For now only organizations are supported so the option is mandatory. 
We **strongly recommend** providing an access token for authentication. If not provided the rate limiting will kick in after about 30 repositories scanned. #### GitLab -For type gitlab you can use the following options: -- `--url`: The url of the gitlab server. -- `--access-token`: Your personal gitlab access token. -- `--group`: A specific gitlab group id you want to san, including subgroups. -- `--ignore-groups`: A list of gitlab group ids you want to ignore -- `--ignore-repos`: A list of gitlab project ids you want to ignore +For type GitLab you can use the following options: +- `--url`: The url of the GitLab server. +- `--access-token`: Your personal GitLab access token. +- `--group`: A specific GitLab group id you want to san, including subgroups. +- `--ignore-groups`: A list of GitLab group ids you want to ignore +- `--ignore-repos`: A list of GitLab project ids you want to ignore +- `--obey-rate-limit`: True to obey the rate limit of the GitLab server (default), otherwise False For gitlab the url and the access token is mandatory. If you don't provide a specific group id all projects on the gitlab server are going to be discovered. From d6d696c02850a4c384df51609c4fab563ed8f337 Mon Sep 17 00:00:00 2001 From: Robert Seedorff Date: Sun, 14 Mar 2021 02:39:55 +0100 Subject: [PATCH 3/9] Added a new configuration option to filter git repos by latest activity date.
--- scanners/git-repo-scanner/README.md.gotmpl | 9 ++ .../scanner/git_repo_scanner.py | 148 ++++++++++++++++-- .../scanner/git_repo_scanner_test.py | 32 ++-- .../git-repo-scanner/scanner/requirements.txt | 2 + 4 files changed, 160 insertions(+), 31 deletions(-) diff --git a/scanners/git-repo-scanner/README.md.gotmpl b/scanners/git-repo-scanner/README.md.gotmpl index 697ce17740..02ab3188cd 100644 --- a/scanners/git-repo-scanner/README.md.gotmpl +++ b/scanners/git-repo-scanner/README.md.gotmpl @@ -38,6 +38,10 @@ For type GitHub you can use the following options: - `--access-token`: Your personal GitHub access token. - `--ignore-repos`: A list of GitHub repository ids you want to ignore - `--obey-rate-limit`: True to obey the rate limit of the GitHub server (default), otherwise False +- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each + with optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. +- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each with + optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. For now only organizations are supported so the option is mandatory. We **strongly recommend** providing an access token for authentication. If not provided the rate limiting will kick in after about 30 repositories scanned. 
@@ -50,6 +54,11 @@ For type GitLab you can use the following options: - `--ignore-groups`: A list of GitLab group ids you want to ignore - `--ignore-repos`: A list of GitLab project ids you want to ignore - `--obey-rate-limit`: True to obey the rate limit of the GitLab server (default), otherwise False +- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each + with optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. +- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each with + optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. + For gitlab the url and the access token is mandatory. If you don't provide a specific group id all projects on the gitlab server are going to be discovered. 
diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner.py b/scanners/git-repo-scanner/scanner/git_repo_scanner.py index aa3f7c67ff..e762690ad9 100644 --- a/scanners/git-repo-scanner/scanner/git_repo_scanner.py +++ b/scanners/git-repo-scanner/scanner/git_repo_scanner.py @@ -4,9 +4,17 @@ import json import calendar import time +from datetime import datetime +import pytz + from typing import List from pathlib import Path +# https://pypi.org/project/pytimeparse/ +from pytimeparse.timeparse import timeparse +# https://docs.python.org/3/library/datetime.html +from datetime import timedelta + import gitlab from gitlab.v4.objects import Project @@ -97,6 +105,14 @@ def get_parser_args(args=None): type=bool, default=True, required=False) + parser.add_argument('--activity-since-duration', + help='Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration)', + type=str, + required=False) + parser.add_argument('--activity-until-duration', + help='Return git repo findings with repo activity (e.g. commits) older than a specific date expresed by an duration (now + duration)', + type=str, + required=False) if args: return parser.parse_args(args) @@ -109,31 +125,70 @@ def parse_gitlab(args): if not args.url: logger.info(' URL required for GitLab connection.') sys.exit(-1) - logger.info(' Gitlab authentication...') + logger.info(' Gitlab authentication...') gl = gitlab_authenticate(args) - projects: List[Project] = get_gitlab_projects(args, gl) + logger.info(' Gitlab retrieve all repositories...') + now_utc = pytz.utc.localize(datetime.utcnow()) + # Respect time filtering based on "pushed_at" (not "updated_at") + # The difference is that "pushed_at" represents the date and time of the last commit, whereas the "updated_at" represents the date and time of the last change the the repository. 
+ # A change to the repository might be a commit, but it may also be other things, such as changing the description of the repo, creating wiki pages, etc. + # In other words, commits are a subset of updates, and the pushed_at timestamp will therefore either be the same as the updated_at timestamp, or it will be an earlier timestamp. + duration = 0 + activityDeltaDatetime = now_utc + if args.activity_since_duration: + activityDuration = timeparse(args.activity_since_duration) + activityDeltaDatetime = timedelta(seconds=activityDuration) + logger.info(' Get all GitLab Repos (filtered by last activity since '+ str(activityDeltaDatetime) +' ago.)') + + projects: List[Project] = get_gitlab_projects_active_since(args, gl) + elif args.activity_until_duration: + activityDuration = timeparse(args.activity_until_duration) + activityDeltaDatetime = timedelta(seconds=activityDuration) + logger.info(' Get all GitLab Repos (filtered by last activity until '+ str(activityDeltaDatetime) +' ago.)') + + projects: List[Project] = get_gitlab_projects_active_until(args, gl) + else: + logger.info(' Get all Gitlab Repos (not filtered)') + projects: List[Project] = get_gitlab_projects_all(args, gl) logger.info(' Process Projects...') - - findings = process_gitlab_projects(args, projects) + activityDate = now_utc - activityDeltaDatetime + findings = process_gitlab_projects(args, projects, activityDeltaDatetime, activityDate) return findings -def process_gitlab_projects(args, projects): +def process_gitlab_projects(args, projects, activityDeltaDatetime, activityDate): findings = [] i = 1 for project in projects: if is_not_on_ignore_list_gitlab(project, args.ignore_groups, args.ignore_repos): - logger.info(f' {i} - {project.name}') + lastUpDatetime = datetime.fromisoformat(project.last_activity_at) + logger.info(f' {i} - Name: {project.name} - LastUpdate: {lastUpDatetime}') i += 1 - findings.append(create_finding_gitlab(project)) + + # respect time filtering + if 
args.activity_since_duration: + if lastUpDatetime > activityDate: + findings.append(create_finding_gitlab(project)) + else: + logger.info(f' Reached activity limit! Ignoring all repos with latest activity since `{activityDeltaDatetime}` ago ({ str(activityDate) }).') + break + elif args.activity_until_duration: + if lastUpDatetime < activityDate: + findings.append(create_finding_gitlab(project)) + else: + logger.info(f' Reached activity limit! Ignoring all repos with latest activity until `{activityDeltaDatetime}` ago ({ str(activityDate) }).') + break + else: + findings.append(create_finding_gitlab(project)) + return findings -def get_gitlab_projects(args, gl): +def get_gitlab_projects_all(args, gl): if args.group: try: projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, obey_rate_limit=args.obey_rate_limit) @@ -144,6 +199,28 @@ def get_gitlab_projects(args, gl): projects = gl.projects.list(all=True, max_retries=12, obey_rate_limit=args.obey_rate_limit) return projects +def get_gitlab_projects_active_since(args, gl): + if args.group: + try: + projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, order_by='last_activity_at', sort='desc', obey_rate_limit=args.obey_rate_limit) + except gitlab.exceptions.GitlabGetError: + logger.info(' Group does not exist.') + sys.exit(-1) + else: + projects = gl.projects.list(all=True, max_retries=12, order_by='last_activity_at', sort='desc', obey_rate_limit=args.obey_rate_limit) + return projects + +def get_gitlab_projects_active_until(args, gl): + if args.group: + try: + projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, order_by='last_activity_at', sort='asc', obey_rate_limit=args.obey_rate_limit) + except gitlab.exceptions.GitlabGetError: + logger.info(' Group does not exist.') + sys.exit(-1) + else: + projects = gl.projects.list(all=True, max_retries=12, order_by='last_activity_at', sort='asc', obey_rate_limit=args.obey_rate_limit) 
+ return projects + def gitlab_authenticate(args): gl: gitlab.Gitlab @@ -195,22 +272,59 @@ def respect_github_ratelimit(args, gh): def process_github_repos(args, gh): findings = [] org: Organization = gh.get_organization(args.organization) - repos: PaginatedList[Repository] = org.get_repos(type='all') + + # Respect time filtering based on "pushed_at" (not "updated_at") + # The difference is that "pushed_at" represents the date and time of the last commit, whereas the "updated_at" represents the date and time of the last change the the repository. + # A change to the repository might be a commit, but it may also be other things, such as changing the description of the repo, creating wiki pages, etc. + # In other words, commits are a subset of updates, and the pushed_at timestamp will therefore either be the same as the updated_at timestamp, or it will be an earlier timestamp. + duration = 0 + activityDeltaDatetime = datetime.now() + if args.activity_since_duration: + activityDuration = timeparse(args.activity_since_duration) + activityDeltaDatetime = timedelta(seconds=activityDuration) + logger.info(' Get all GitHub Repos (filtered by last activity since '+ str(activityDeltaDatetime) +' ago.)') + + repos: PaginatedList[Repository] = org.get_repos(type='all', sort='pushed', direction='desc') + elif args.activity_until_duration: + activityDuration = timeparse(args.activity_until_duration) + activityDeltaDatetime = timedelta(seconds=activityDuration) + logger.info(' Get all GitHub Repos (filtered by last activity until '+ str(activityDeltaDatetime) +' ago.)') + + repos: PaginatedList[Repository] = org.get_repos(type='all', sort='pushed', direction='asc') + else: + logger.info(' Get all GitHub Repos (not filtered)') + repos: PaginatedList[Repository] = org.get_repos(type='all') + + activityDate = datetime.now() - activityDeltaDatetime + for i in range(repos.totalCount): - process_github_repos_page(args, findings, repos.get_page(i), gh) + process_github_repos_page(args, 
findings, repos.get_page(i), gh, activityDeltaDatetime, activityDate) return findings - -def process_github_repos_page(args, findings, repos, gh): +def process_github_repos_page(args, findings, repos, gh, activityDeltaDatetime, activityDate): repo: Repository for repo in repos: if repo.id not in args.ignore_repos: - logger.info(f' {len(findings) + 1} - {repo.name}') + logger.info(f' {len(findings) + 1} - Name: {repo.name} - LastUpdate: {repo.updated_at} - LastPush: {repo.pushed_at}') - findings.append(create_finding_github(repo)) - respect_github_ratelimit(args, gh) - - + # respect time filtering + if args.activity_since_duration: + if repo.updated_at > activityDate: + findings.append(create_finding_github(repo)) + respect_github_ratelimit(args, gh) + else: + logger.info(f' Reached activity limit! Ignoring all repos with latest activity since `{activityDeltaDatetime}` ago ({ str(activityDate) }).') + break + elif args.activity_until_duration: + if repo.updated_at < activityDate: + findings.append(create_finding_github(repo)) + respect_github_ratelimit(args, gh) + else: + logger.info(f' Reached activity limit! 
Ignoring all repos with latest activity until `{activityDeltaDatetime}` ago ({ str(activityDate) }).') + break + else: + findings.append(create_finding_github(repo)) + respect_github_ratelimit(args, gh) def setup_github(args): if args.url: diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py b/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py index 71f2247039..402fb5fb09 100644 --- a/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py +++ b/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py @@ -1,4 +1,5 @@ import datetime +from datetime import timezone import unittest import git_repo_scanner from munch import Munch @@ -13,7 +14,7 @@ def test_process_gitlab_projects_with_no_ignore_list(self): projects = assemble_projects() args = get_args() # when - findings = git_repo_scanner.process_gitlab_projects(args, projects) + findings = git_repo_scanner.process_gitlab_projects(args, projects, 0, datetime.datetime.now()) # then self.assertEqual(3, len(findings), msg='There should be exactly 3 findings') self.assertEqual(findings[0]['name'], 'GitLab Repo', msg='Test finding output') @@ -25,7 +26,7 @@ def test_process_gitlab_projects_with_ignore_group(self): projects = assemble_projects() args = get_args(ignore_groups=33) # when - findings = git_repo_scanner.process_gitlab_projects(args, projects) + findings = git_repo_scanner.process_gitlab_projects(args, projects, 0, datetime.datetime.now()) # then self.assertEqual(2, len(findings), msg='There should be exactly 2 findings') self.assertEqual(findings[0]['attributes']['web_url'], 'url1', msg='Test finding output') @@ -36,7 +37,7 @@ def test_process_gitlab_projects_with_ignore_project(self): projects = assemble_projects() args = get_args(ignore_projects=1) # when - findings = git_repo_scanner.process_gitlab_projects(args, projects) + findings = git_repo_scanner.process_gitlab_projects(args, projects, 0, datetime.datetime.now()) # then self.assertEqual(2, len(findings), msg='There 
should be exactly 2 findings') self.assertEqual(findings[0]['attributes']['web_url'], 'url2', msg='Test finding output') @@ -118,14 +119,16 @@ def create_mocks(github_mock, org_mock, pag_mock, repos): def assemble_projects(): - project1 = assemble_project(p_id=1, name='name1', url='url1', path='path1', date_created='10.10.2020', - date_updated='10.11.2020', visibility='private', o_id=11, o_kind='group', + created = datetime.datetime(2020, 10, 10, tzinfo=timezone.utc).isoformat() + updated = datetime.datetime(2020, 11, 10, tzinfo=timezone.utc).isoformat() + project1 = assemble_project(p_id=1, name='name1', url='url1', path='path1', date_created=created, + date_updated=updated, visibility='private', o_id=11, o_kind='group', o_name='name11') - project2 = assemble_project(p_id=2, name='name2', url='url2', path='path2', date_created='10.10.2020', - date_updated='10.11.2020', visibility='private', o_id=22, o_kind='user', + project2 = assemble_project(p_id=2, name='name2', url='url2', path='path2', date_created=created, + date_updated=updated, visibility='private', o_id=22, o_kind='user', o_name='name22') - project3 = assemble_project(p_id=3, name='name3', url='url3', path='path3', date_created='10.10.2020', - date_updated='10.11.2020', visibility='private', o_id=33, o_kind='group', + project3 = assemble_project(p_id=3, name='name3', url='url3', path='path3', date_created=created, + date_updated=updated, visibility='private', o_id=33, o_kind='group', o_name='name33') return [project1, project2, project3] @@ -148,20 +151,20 @@ def assemble_project(p_id, name, url, path, date_created, date_updated, visibili def assemble_repos(): - date = datetime.datetime(2020, 5, 17) + date = datetime.datetime(2020, 5, 17, tzinfo=timezone.utc) project1 = assemble_repository(p_id=1, name='name1', url='url1', path='path1', date_created=date, - date_updated=date, visibility=True, o_id=11, o_kind='organization', + date_updated=date, date_pushed=date, visibility=True, o_id=11, 
o_kind='organization', o_name='name11') project2 = assemble_repository(p_id=2, name='name2', url='url2', path='path2', date_created=date, - date_updated=date, visibility=False, o_id=22, o_kind='organization', + date_updated=date, date_pushed=date, visibility=False, o_id=22, o_kind='organization', o_name='name22') project3 = assemble_repository(p_id=3, name='name3', url='url3', path='path3', date_created=date, - date_updated=date, visibility=False, o_id=33, o_kind='organization', + date_updated=date, date_pushed=date, visibility=False, o_id=33, o_kind='organization', o_name='name33') return [project1, project2, project3] -def assemble_repository(p_id, name, url, path, date_created: datetime, date_updated: datetime, visibility: bool, o_id, +def assemble_repository(p_id, name, url, path, date_created: datetime, date_updated: datetime, date_pushed: datetime, visibility: bool, o_id, o_kind, o_name): repo = Munch() repo.id = p_id @@ -169,6 +172,7 @@ def assemble_repository(p_id, name, url, path, date_created: datetime, date_upda repo.html_url = url repo.full_name = path repo.created_at = date_created + repo.pushed_at = date_pushed repo.updated_at = date_updated repo.private = visibility repo.owner = Munch(type=o_kind, id=o_id, name=o_name) diff --git a/scanners/git-repo-scanner/scanner/requirements.txt b/scanners/git-repo-scanner/scanner/requirements.txt index 95198b0130..110b788b82 100644 --- a/scanners/git-repo-scanner/scanner/requirements.txt +++ b/scanners/git-repo-scanner/scanner/requirements.txt @@ -2,3 +2,5 @@ PyGithub == 1.54.1 python-gitlab == 2.6.0 munch == 2.5.0 mock == 4.0.2 +pytimeparse == 1.1.8 +pytz == 2021.1 From b1ac75121b0072f3fc2111678982b1bc33d9c4f2 Mon Sep 17 00:00:00 2001 From: rseedorff Date: Sun, 14 Mar 2021 01:40:36 +0000 Subject: [PATCH 4/9] Updating Helm Docs --- scanners/git-repo-scanner/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scanners/git-repo-scanner/README.md b/scanners/git-repo-scanner/README.md index 
ebcd88b534..10258cc286 100644 --- a/scanners/git-repo-scanner/README.md +++ b/scanners/git-repo-scanner/README.md @@ -37,6 +37,10 @@ For type GitHub you can use the following options: - `--access-token`: Your personal GitHub access token. - `--ignore-repos`: A list of GitHub repository ids you want to ignore - `--obey-rate-limit`: True to obey the rate limit of the GitHub server (default), otherwise False +- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each + with optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. +- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each with + optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. For now only organizations are supported so the option is mandatory. We **strongly recommend** providing an access token for authentication. If not provided the rate limiting will kick in after about 30 repositories scanned. @@ -49,6 +53,10 @@ For type GitLab you can use the following options: - `--ignore-groups`: A list of GitLab group ids you want to ignore - `--ignore-repos`: A list of GitLab project ids you want to ignore - `--obey-rate-limit`: True to obey the rate limit of the GitLab server (default), otherwise False +- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each + with optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. 
+- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each with + optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. For gitlab the url and the access token is mandatory. If you don't provide a specific group id all projects on the gitlab server are going to be discovered. From 22aea9e4586cb8bcdc822c5f95050d59e64ef8d4 Mon Sep 17 00:00:00 2001 From: Robert Seedorff Date: Sun, 14 Mar 2021 20:26:14 +0100 Subject: [PATCH 5/9] Fixing missing python requirements bug in docker image. --- .github/workflows/ci.yaml | 2 +- scanners/git-repo-scanner/scanner/.dockerignore | 3 +++ scanners/git-repo-scanner/scanner/Dockerfile | 4 ++-- .../{git_repo_scanner_test.py => git_repo_scanner.test.py} | 0 4 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 scanners/git-repo-scanner/scanner/.dockerignore rename scanners/git-repo-scanner/scanner/{git_repo_scanner_test.py => git_repo_scanner.test.py} (100%) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 65bc690705..b0c2a078d4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -41,7 +41,7 @@ jobs: working-directory: scanners/git-repo-scanner/scanner/ run: | pip install pytest - pytest ${{ matrix.unit }}_test.py + pytest ${{ matrix.unit }}.test.py # ---- Unit-Test | JavaScript ---- diff --git a/scanners/git-repo-scanner/scanner/.dockerignore b/scanners/git-repo-scanner/scanner/.dockerignore new file mode 100644 index 0000000000..bf3dcb3752 --- /dev/null +++ b/scanners/git-repo-scanner/scanner/.dockerignore @@ -0,0 +1,3 @@ +__pytest_cache +.pytest_cache +*.test.py diff --git a/scanners/git-repo-scanner/scanner/Dockerfile b/scanners/git-repo-scanner/scanner/Dockerfile index 48d3de19b1..50ccc700a1 100644 --- a/scanners/git-repo-scanner/scanner/Dockerfile +++ 
b/scanners/git-repo-scanner/scanner/Dockerfile @@ -1,5 +1,5 @@ FROM python:3.9.0-alpine -COPY git_repo_scanner.py /scripts/git_repo_scanner.py -RUN pip install PyGithub python-gitlab +COPY . /scripts/ +RUN pip install -r /scripts/requirements.txt CMD ["/bin/sh"] ENTRYPOINT ["python","/scripts/git_repo_scanner.py"] diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py b/scanners/git-repo-scanner/scanner/git_repo_scanner.test.py similarity index 100% rename from scanners/git-repo-scanner/scanner/git_repo_scanner_test.py rename to scanners/git-repo-scanner/scanner/git_repo_scanner.test.py From 0b87d3a401f152d52b973456689c7455a83929b2 Mon Sep 17 00:00:00 2001 From: Robert Seedorff Date: Sun, 14 Mar 2021 20:31:12 +0100 Subject: [PATCH 6/9] Fixing failed pytest. --- .github/workflows/ci.yaml | 2 +- scanners/git-repo-scanner/scanner/.dockerignore | 2 +- .../{git_repo_scanner.test.py => git_repo_scanner_test.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename scanners/git-repo-scanner/scanner/{git_repo_scanner.test.py => git_repo_scanner_test.py} (100%) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b0c2a078d4..65bc690705 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -41,7 +41,7 @@ jobs: working-directory: scanners/git-repo-scanner/scanner/ run: | pip install pytest - pytest ${{ matrix.unit }}.test.py + pytest ${{ matrix.unit }}_test.py # ---- Unit-Test | JavaScript ---- diff --git a/scanners/git-repo-scanner/scanner/.dockerignore b/scanners/git-repo-scanner/scanner/.dockerignore index bf3dcb3752..f88f2f169d 100644 --- a/scanners/git-repo-scanner/scanner/.dockerignore +++ b/scanners/git-repo-scanner/scanner/.dockerignore @@ -1,3 +1,3 @@ __pytest_cache .pytest_cache -*.test.py +*_test.py diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner.test.py b/scanners/git-repo-scanner/scanner/git_repo_scanner_test.py similarity index 100% rename from 
scanners/git-repo-scanner/scanner/git_repo_scanner.test.py rename to scanners/git-repo-scanner/scanner/git_repo_scanner_test.py From a3a777ccb604c5fdaa8eb6cb3e1f89b9c55b4efa Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 26 Mar 2021 09:45:42 +0100 Subject: [PATCH 7/9] git-repo-scanner refactor --- scanners/git-repo-scanner/README.md | 26 +- .../scanner/git_repo_scanner.py | 631 +++++++++--------- .../scanner/git_repo_scanner/__init__.py | 0 .../git_repo_scanner/abstract_scanner.py | 0 4 files changed, 343 insertions(+), 314 deletions(-) create mode 100644 scanners/git-repo-scanner/scanner/git_repo_scanner/__init__.py create mode 100644 scanners/git-repo-scanner/scanner/git_repo_scanner/abstract_scanner.py diff --git a/scanners/git-repo-scanner/README.md b/scanners/git-repo-scanner/README.md index 10258cc286..f305bb2dd4 100644 --- a/scanners/git-repo-scanner/README.md +++ b/scanners/git-repo-scanner/README.md @@ -37,10 +37,14 @@ For type GitHub you can use the following options: - `--access-token`: Your personal GitHub access token. - `--ignore-repos`: A list of GitHub repository ids you want to ignore - `--obey-rate-limit`: True to obey the rate limit of the GitHub server (default), otherwise False -- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each - with optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. -- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each with - optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. +- `--activity-since-duration`: Return git repo findings with repo activity (e.g. 
commits) more recent than a specific + date expressed by a duration (now - duration). A duration string is a possibly signed sequence of decimal numbers, each + with an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units + are 'm', 'h', 'd', 'w'. +- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date + expressed by a duration (now - duration). A duration string is a possibly signed sequence of decimal numbers, each with + an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm + ', 'h', 'd', 'w'. For now only organizations are supported so the option is mandatory. We **strongly recommend** providing an access token for authentication. If not provided the rate limiting will kick in after about 30 repositories scanned. @@ -49,14 +53,18 @@ for authentication. If not provided the rate limiting will kick in after about 3 For type GitLab you can use the following options: - `--url`: The url of the GitLab server. - `--access-token`: Your personal GitLab access token. -- `--group`: A specific GitLab group id you want to san, including subgroups. +- `--group`: A specific GitLab group id you want to scan, including subgroups. - `--ignore-groups`: A list of GitLab group ids you want to ignore - `--ignore-repos`: A list of GitLab project ids you want to ignore - `--obey-rate-limit`: True to obey the rate limit of the GitLab server (default), otherwise False -- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each - with optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. -- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date expresed by an duration (now + duration). 
A duration string is a possibly signed sequence of decimal numbers, each with - optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. +- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific + date expressed by a duration (now - duration). A duration string is a possibly signed sequence of decimal numbers, each + with an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units + are 'm', 'h', 'd', 'w'. +- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date + expressed by a duration (now - duration). A duration string is a possibly signed sequence of decimal numbers, each with + an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm + ', 'h', 'd', 'w'. For gitlab the url and the access token is mandatory. If you don't provide a specific group id all projects on the gitlab server are going to be discovered. 
diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner.py b/scanners/git-repo-scanner/scanner/git_repo_scanner.py index e762690ad9..dbd692994c 100644 --- a/scanners/git-repo-scanner/scanner/git_repo_scanner.py +++ b/scanners/git-repo-scanner/scanner/git_repo_scanner.py @@ -28,378 +28,399 @@ def main(): - args = get_parser_args() + args = get_parser_args() - findings = process(args) + findings = process(args) - logger.info(' Write findings to file...') - write_findings_to_file(args, findings) - logger.info(' Finished!') + logger.info(' Write findings to file...') + write_findings_to_file(args, findings) + logger.info(' Finished!') def process(args): - if args.git_type == 'gitlab': - return process_gitlab(args) - else: - return process_github(args) + if args.git_type == 'gitlab': + return process_gitlab(args) + else: + return process_github(args) def process_github(args): - try: - return parse_github(args) - except github.GithubException as e: - logger.info(f' Github API Exception: {e.status} -> {e.data["message"]}') - sys.exit(-1) + try: + return parse_github(args) + except github.GithubException as e: + logger.info(f' Github API Exception: {e.status} -> {e.data["message"]}') + sys.exit(-1) def process_gitlab(args): - try: - return parse_gitlab(args) - except gitlab.GitlabError as e: - logger.info(f' Gitlab API Exception: {e}') - sys.exit(-1) + try: + return parse_gitlab(args) + except gitlab.GitlabError as e: + logger.info(f' Gitlab API Exception: {e}') + sys.exit(-1) def write_findings_to_file(args, findings): - Path(args.file_output).mkdir(parents=True, exist_ok=True) - with open(f'{args.file_output}/git-repo-scanner-findings.json', 'w') as out: - json.dump(findings, out) + Path(args.file_output).mkdir(parents=True, exist_ok=True) + with open(f'{args.file_output}/git-repo-scanner-findings.json', 'w') as out: + json.dump(findings, out) def get_parser_args(args=None): - parser = argparse.ArgumentParser(description='Scan public or private git repositories 
of organizations or groups') - parser.add_argument('--git-type', - help='Repository type can be github or GitLab', - choices=['github', 'gitlab'], - required=True) - parser.add_argument('--file-output', - help='The path of the output file', - required=True), - parser.add_argument('--url', help='The GitLab url or a GitHub enterprise api url.', - required=False) - parser.add_argument('--access-token', - help='An access token for authentication', - required=False) - parser.add_argument('--organization', - help='The name of the GitHub organization to scan', - required=False) - parser.add_argument('--group', - help='The id of the GitLab group to scan', - required=False) - parser.add_argument('--ignore-repos', - help='A list of repo ids to ignore', - action='extend', - nargs='+', - type=int, - default=[], - required=False) - parser.add_argument('--ignore-groups', - help='A list of GitLab group ids to ignore', - action='extend', - nargs='+', - type=int, - default=[], - required=False) - parser.add_argument('--obey-rate-limit', - help='True to obey the rate limit of the GitLab or GitHub server (default), otherwise False', - type=bool, - default=True, - required=False) - parser.add_argument('--activity-since-duration', - help='Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration)', - type=str, - required=False) - parser.add_argument('--activity-until-duration', - help='Return git repo findings with repo activity (e.g. 
commits) older than a specific date expresed by an duration (now + duration)', - type=str, - required=False) - - if args: - return parser.parse_args(args) - else: - return parser.parse_args() + parser = argparse.ArgumentParser(description='Scan public or private git repositories of organizations or groups') + parser.add_argument('--git-type', + help='Repository type can be github or GitLab', + choices=['github', 'gitlab'], + required=True) + parser.add_argument('--file-output', + help='The path of the output file', + required=True), + parser.add_argument('--url', help='The GitLab url or a GitHub enterprise api url.', + required=False) + parser.add_argument('--access-token', + help='An access token for authentication', + required=False) + parser.add_argument('--organization', + help='The name of the GitHub organization to scan', + required=False) + parser.add_argument('--group', + help='The id of the GitLab group to scan', + required=False) + parser.add_argument('--ignore-repos', + help='A list of repo ids to ignore', + action='extend', + nargs='+', + type=int, + default=[], + required=False) + parser.add_argument('--ignore-groups', + help='A list of GitLab group ids to ignore', + action='extend', + nargs='+', + type=int, + default=[], + required=False) + parser.add_argument('--obey-rate-limit', + help='True to obey the rate limit of the GitLab or GitHub server (default), otherwise False', + type=bool, + default=True, + required=False) + parser.add_argument('--activity-since-duration', + help='Return git repo findings with repo activity (e.g. commits) more recent than a specific date expresed by an duration (now + duration)', + type=str, + required=False) + parser.add_argument('--activity-until-duration', + help='Return git repo findings with repo activity (e.g. 
commits) older than a specific date expresed by an duration (now + duration)', + type=str, + required=False) + + if args: + return parser.parse_args(args) + else: + return parser.parse_args() def parse_gitlab(args): - gl: gitlab.Gitlab - if not args.url: - logger.info(' URL required for GitLab connection.') - sys.exit(-1) - - logger.info(' Gitlab authentication...') - gl = gitlab_authenticate(args) - - logger.info(' Gitlab retrieve all repositories...') - now_utc = pytz.utc.localize(datetime.utcnow()) - # Respect time filtering based on "pushed_at" (not "updated_at") - # The difference is that "pushed_at" represents the date and time of the last commit, whereas the "updated_at" represents the date and time of the last change the the repository. - # A change to the repository might be a commit, but it may also be other things, such as changing the description of the repo, creating wiki pages, etc. - # In other words, commits are a subset of updates, and the pushed_at timestamp will therefore either be the same as the updated_at timestamp, or it will be an earlier timestamp. 
- duration = 0 - activityDeltaDatetime = now_utc - if args.activity_since_duration: - activityDuration = timeparse(args.activity_since_duration) - activityDeltaDatetime = timedelta(seconds=activityDuration) - logger.info(' Get all GitLab Repos (filtered by last activity since '+ str(activityDeltaDatetime) +' ago.)') - - projects: List[Project] = get_gitlab_projects_active_since(args, gl) - elif args.activity_until_duration: - activityDuration = timeparse(args.activity_until_duration) - activityDeltaDatetime = timedelta(seconds=activityDuration) - logger.info(' Get all GitLab Repos (filtered by last activity until '+ str(activityDeltaDatetime) +' ago.)') - - projects: List[Project] = get_gitlab_projects_active_until(args, gl) - else: - logger.info(' Get all Gitlab Repos (not filtered)') - projects: List[Project] = get_gitlab_projects_all(args, gl) - - logger.info(' Process Projects...') - activityDate = now_utc - activityDeltaDatetime - findings = process_gitlab_projects(args, projects, activityDeltaDatetime, activityDate) - - return findings + gl: gitlab.Gitlab + if not args.url: + logger.info(' URL required for GitLab connection.') + sys.exit(-1) + + logger.info(' Gitlab authentication...') + gl = gitlab_authenticate(args) + + logger.info(' Gitlab retrieve all repositories...') + now_utc = pytz.utc.localize(datetime.utcnow()) + # Respect time filtering based on "pushed_at" (not "updated_at") + # The difference is that "pushed_at" represents the date and time of the last commit, whereas the "updated_at" represents the date and time of the last change the the repository. + # A change to the repository might be a commit, but it may also be other things, such as changing the description of the repo, creating wiki pages, etc. + # In other words, commits are a subset of updates, and the pushed_at timestamp will therefore either be the same as the updated_at timestamp, or it will be an earlier timestamp. 
+ duration = 0 + activityDeltaDatetime = now_utc + if args.activity_since_duration: + activityDuration = timeparse(args.activity_since_duration) + activityDeltaDatetime = timedelta(seconds=activityDuration) + logger.info(' Get all GitLab Repos (filtered by last activity since ' + str(activityDeltaDatetime) + ' ago.)') + + projects: List[Project] = get_gitlab_projects_active_since(args, gl) + elif args.activity_until_duration: + activityDuration = timeparse(args.activity_until_duration) + activityDeltaDatetime = timedelta(seconds=activityDuration) + logger.info(' Get all GitLab Repos (filtered by last activity until ' + str(activityDeltaDatetime) + ' ago.)') + + projects: List[Project] = get_gitlab_projects_active_until(args, gl) + else: + logger.info(' Get all Gitlab Repos (not filtered)') + projects: List[Project] = get_gitlab_projects_all(args, gl) + + logger.info(' Process Projects...') + activityDate = now_utc - activityDeltaDatetime + findings = process_gitlab_projects(args, projects, activityDeltaDatetime, activityDate) + + return findings def process_gitlab_projects(args, projects, activityDeltaDatetime, activityDate): - findings = [] - i = 1 - for project in projects: - if is_not_on_ignore_list_gitlab(project, args.ignore_groups, args.ignore_repos): - lastUpDatetime = datetime.fromisoformat(project.last_activity_at) - logger.info(f' {i} - Name: {project.name} - LastUpdate: {lastUpDatetime}') - i += 1 - - # respect time filtering - if args.activity_since_duration: - if lastUpDatetime > activityDate: - findings.append(create_finding_gitlab(project)) - else: - logger.info(f' Reached activity limit! Ignoring all repos with latest activity since `{activityDeltaDatetime}` ago ({ str(activityDate) }).') - break - elif args.activity_until_duration: - if lastUpDatetime < activityDate: - findings.append(create_finding_gitlab(project)) - else: - logger.info(f' Reached activity limit! 
Ignoring all repos with latest activity until `{activityDeltaDatetime}` ago ({ str(activityDate) }).') - break - else: - findings.append(create_finding_gitlab(project)) - - return findings + findings = [] + i = 1 + for project in projects: + if is_not_on_ignore_list_gitlab(project, args.ignore_groups, args.ignore_repos): + lastUpDatetime = datetime.fromisoformat(project.last_activity_at) + logger.info(f' {i} - Name: {project.name} - LastUpdate: {lastUpDatetime}') + i += 1 + + # respect time filtering + if args.activity_since_duration: + if lastUpDatetime > activityDate: + findings.append(create_finding_gitlab(project)) + else: + logger.info( + f' Reached activity limit! Ignoring all repos with latest activity since `{activityDeltaDatetime}` ago ({str(activityDate)}).') + break + elif args.activity_until_duration: + if lastUpDatetime < activityDate: + findings.append(create_finding_gitlab(project)) + else: + logger.info( + f' Reached activity limit! Ignoring all repos with latest activity until `{activityDeltaDatetime}` ago ({str(activityDate)}).') + break + else: + findings.append(create_finding_gitlab(project)) + + return findings def get_gitlab_projects_all(args, gl): - if args.group: - try: - projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, obey_rate_limit=args.obey_rate_limit) - except gitlab.exceptions.GitlabGetError: - logger.info(' Group does not exist.') - sys.exit(-1) - else: - projects = gl.projects.list(all=True, max_retries=12, obey_rate_limit=args.obey_rate_limit) - return projects + if args.group: + try: + projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, + obey_rate_limit=args.obey_rate_limit) + except gitlab.exceptions.GitlabGetError: + logger.info(' Group does not exist.') + sys.exit(-1) + else: + projects = gl.projects.list(all=True, max_retries=12, obey_rate_limit=args.obey_rate_limit) + return projects + def get_gitlab_projects_active_since(args, gl): - if args.group: - try: 
- projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, order_by='last_activity_at', sort='desc', obey_rate_limit=args.obey_rate_limit) - except gitlab.exceptions.GitlabGetError: - logger.info(' Group does not exist.') - sys.exit(-1) - else: - projects = gl.projects.list(all=True, max_retries=12, order_by='last_activity_at', sort='desc', obey_rate_limit=args.obey_rate_limit) - return projects + if args.group: + try: + projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, + order_by='last_activity_at', + sort='desc', obey_rate_limit=args.obey_rate_limit) + except gitlab.exceptions.GitlabGetError: + logger.info(' Group does not exist.') + sys.exit(-1) + else: + projects = gl.projects.list(all=True, max_retries=12, order_by='last_activity_at', sort='desc', + obey_rate_limit=args.obey_rate_limit) + return projects + def get_gitlab_projects_active_until(args, gl): - if args.group: - try: - projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, order_by='last_activity_at', sort='asc', obey_rate_limit=args.obey_rate_limit) - except gitlab.exceptions.GitlabGetError: - logger.info(' Group does not exist.') - sys.exit(-1) - else: - projects = gl.projects.list(all=True, max_retries=12, order_by='last_activity_at', sort='asc', obey_rate_limit=args.obey_rate_limit) - return projects + if args.group: + try: + projects = gl.groups.get(args.group).projects.list(all=True, include_subgroups=True, + order_by='last_activity_at', + sort='asc', obey_rate_limit=args.obey_rate_limit) + except gitlab.exceptions.GitlabGetError: + logger.info(' Group does not exist.') + sys.exit(-1) + else: + projects = gl.projects.list(all=True, max_retries=12, order_by='last_activity_at', sort='asc', + obey_rate_limit=args.obey_rate_limit) + return projects def gitlab_authenticate(args): - gl: gitlab.Gitlab - if args.access_token: - try: - gl = gitlab.Gitlab(args.url, args.access_token) - gl.auth() - except 
gitlab.exceptions.GitlabAuthenticationError: - gl = gitlab_authenticate_oauth(args) - else: - logger.info(' Access token required for GitLab authentication.') - sys.exit(-1) - logger.info(' Success') - return gl + gl: gitlab.Gitlab + if args.access_token: + try: + gl = gitlab.Gitlab(args.url, args.access_token) + gl.auth() + except gitlab.exceptions.GitlabAuthenticationError: + gl = gitlab_authenticate_oauth(args) + else: + logger.info(' Access token required for GitLab authentication.') + sys.exit(-1) + logger.info(' Success') + return gl def gitlab_authenticate_oauth(args): - try: - gl = gitlab.Gitlab(args.url, oauth_token=args.access_token) - gl.auth() - except gitlab.exceptions.GitlabAuthenticationError: - logger.info(' No permission. Check your access token.') - sys.exit(-1) - return gl + try: + gl = gitlab.Gitlab(args.url, oauth_token=args.access_token) + gl.auth() + except gitlab.exceptions.GitlabAuthenticationError: + logger.info(' No permission. Check your access token.') + sys.exit(-1) + return gl def parse_github(args): - gh: github.Github = setup_github(args) + gh: github.Github = setup_github(args) - logger.info(' Process Repositories...') + logger.info(' Process Repositories...') + + if args.organization: + findings = process_github_repos(args, gh) + return findings + else: + logger.info(' No organization provided') + sys.exit(-1) - if args.organization: - findings = process_github_repos(args, gh) - return findings - else: - logger.info(' No organization provided') - sys.exit(-1) def respect_github_ratelimit(args, gh): - if args.obey_rate_limit: - api_limit = gh.get_rate_limit().core - reset_timestamp = calendar.timegm(api_limit.reset.timetuple()) - seconds_until_reset = reset_timestamp - calendar.timegm(time.gmtime()) + 5 # add 5 seconds to be sure the rate limit has been reset - sleep_time = seconds_until_reset / api_limit.remaining + if args.obey_rate_limit: + api_limit = gh.get_rate_limit().core + reset_timestamp = 
calendar.timegm(api_limit.reset.timetuple()) + seconds_until_reset = reset_timestamp - calendar.timegm( + time.gmtime()) + 5 # add 5 seconds to be sure the rate limit has been reset + sleep_time = seconds_until_reset / api_limit.remaining + + logger.info(' Checking Rate-Limit (' + str(args.obey_rate_limit) + ') [remainingApiCalls: ' + str( + api_limit.remaining) + ', seconds_until_reset: ' + str(seconds_until_reset) + ', sleepTime: ' + str( + sleep_time) + ']') + time.sleep(sleep_time) - logger.info(' Checking Rate-Limit ('+ str(args.obey_rate_limit) +') [remainingApiCalls: ' + str(api_limit.remaining) + ', seconds_until_reset: ' + str(seconds_until_reset) + ', sleepTime: ' + str(sleep_time) + ']') - time.sleep(sleep_time) def process_github_repos(args, gh): - findings = [] - org: Organization = gh.get_organization(args.organization) - - # Respect time filtering based on "pushed_at" (not "updated_at") - # The difference is that "pushed_at" represents the date and time of the last commit, whereas the "updated_at" represents the date and time of the last change the the repository. - # A change to the repository might be a commit, but it may also be other things, such as changing the description of the repo, creating wiki pages, etc. - # In other words, commits are a subset of updates, and the pushed_at timestamp will therefore either be the same as the updated_at timestamp, or it will be an earlier timestamp. 
- duration = 0 - activityDeltaDatetime = datetime.now() - if args.activity_since_duration: - activityDuration = timeparse(args.activity_since_duration) - activityDeltaDatetime = timedelta(seconds=activityDuration) - logger.info(' Get all GitHub Repos (filtered by last activity since '+ str(activityDeltaDatetime) +' ago.)') - - repos: PaginatedList[Repository] = org.get_repos(type='all', sort='pushed', direction='desc') - elif args.activity_until_duration: - activityDuration = timeparse(args.activity_until_duration) - activityDeltaDatetime = timedelta(seconds=activityDuration) - logger.info(' Get all GitHub Repos (filtered by last activity until '+ str(activityDeltaDatetime) +' ago.)') - - repos: PaginatedList[Repository] = org.get_repos(type='all', sort='pushed', direction='asc') - else: - logger.info(' Get all GitHub Repos (not filtered)') - repos: PaginatedList[Repository] = org.get_repos(type='all') - - activityDate = datetime.now() - activityDeltaDatetime - - for i in range(repos.totalCount): - process_github_repos_page(args, findings, repos.get_page(i), gh, activityDeltaDatetime, activityDate) - return findings + findings = [] + org: Organization = gh.get_organization(args.organization) + + # Respect time filtering based on "pushed_at" (not "updated_at") + # The difference is that "pushed_at" represents the date and time of the last commit, whereas the "updated_at" represents the date and time of the last change the the repository. + # A change to the repository might be a commit, but it may also be other things, such as changing the description of the repo, creating wiki pages, etc. + # In other words, commits are a subset of updates, and the pushed_at timestamp will therefore either be the same as the updated_at timestamp, or it will be an earlier timestamp. 
+ duration = 0 + activityDeltaDatetime = datetime.now() + if args.activity_since_duration: + activityDuration = timeparse(args.activity_since_duration) + activityDeltaDatetime = timedelta(seconds=activityDuration) + logger.info(' Get all GitHub Repos (filtered by last activity since ' + str(activityDeltaDatetime) + ' ago.)') + + repos: PaginatedList[Repository] = org.get_repos(type='all', sort='pushed', direction='desc') + elif args.activity_until_duration: + activityDuration = timeparse(args.activity_until_duration) + activityDeltaDatetime = timedelta(seconds=activityDuration) + logger.info(' Get all GitHub Repos (filtered by last activity until ' + str(activityDeltaDatetime) + ' ago.)') + + repos: PaginatedList[Repository] = org.get_repos(type='all', sort='pushed', direction='asc') + else: + logger.info(' Get all GitHub Repos (not filtered)') + repos: PaginatedList[Repository] = org.get_repos(type='all') + + activityDate = datetime.now() - activityDeltaDatetime + + for i in range(repos.totalCount): + process_github_repos_page(args, findings, repos.get_page(i), gh, activityDeltaDatetime, activityDate) + return findings + def process_github_repos_page(args, findings, repos, gh, activityDeltaDatetime, activityDate): - repo: Repository - for repo in repos: - if repo.id not in args.ignore_repos: - logger.info(f' {len(findings) + 1} - Name: {repo.name} - LastUpdate: {repo.updated_at} - LastPush: {repo.pushed_at}') - - # respect time filtering - if args.activity_since_duration: - if repo.updated_at > activityDate: - findings.append(create_finding_github(repo)) - respect_github_ratelimit(args, gh) - else: - logger.info(f' Reached activity limit! Ignoring all repos with latest activity since `{activityDeltaDatetime}` ago ({ str(activityDate) }).') - break - elif args.activity_until_duration: - if repo.updated_at < activityDate: - findings.append(create_finding_github(repo)) - respect_github_ratelimit(args, gh) - else: - logger.info(f' Reached activity limit! 
Ignoring all repos with latest activity until `{activityDeltaDatetime}` ago ({ str(activityDate) }).') - break - else: - findings.append(create_finding_github(repo)) - respect_github_ratelimit(args, gh) + repo: Repository + for repo in repos: + if repo.id not in args.ignore_repos: + logger.info( + f' {len(findings) + 1} - Name: {repo.name} - LastUpdate: {repo.updated_at} - LastPush: {repo.pushed_at}') + + # respect time filtering + if args.activity_since_duration: + if repo.updated_at > activityDate: + findings.append(create_finding_github(repo)) + respect_github_ratelimit(args, gh) + else: + logger.info( + f' Reached activity limit! Ignoring all repos with latest activity since `{activityDeltaDatetime}` ago ({str(activityDate)}).') + break + elif args.activity_until_duration: + if repo.updated_at < activityDate: + findings.append(create_finding_github(repo)) + respect_github_ratelimit(args, gh) + else: + logger.info( + f' Reached activity limit! Ignoring all repos with latest activity until `{activityDeltaDatetime}` ago ({str(activityDate)}).') + break + else: + findings.append(create_finding_github(repo)) + respect_github_ratelimit(args, gh) + def setup_github(args): - if args.url: - return setup_github_with_url(args) - else: - return setup_github_without_url(args) + if args.url: + return setup_github_with_url(args) + else: + return setup_github_without_url(args) def setup_github_without_url(args): - if args.access_token: - return github.Github(args.access_token) - else: - return github.Github() + if args.access_token: + return github.Github(args.access_token) + else: + return github.Github() def setup_github_with_url(args): - if args.access_token: - return github.Github(base_url=args.url, login_or_token=args.access_token) - else: - logger.info(' Access token required for github enterprise authentication.') - sys.exit(-1) + if args.access_token: + return github.Github(base_url=args.url, login_or_token=args.access_token) + else: + logger.info(' Access token 
required for github enterprise authentication.') + sys.exit(-1) def is_not_on_ignore_list_gitlab(project: Project, groups: List, repos: List): - id_project = project.id - kind = project.namespace['kind'] - id_namespace = project.namespace['id'] - if id_project in repos: - return False - if kind == 'group' and id_namespace in groups: - return False - return True + id_project = project.id + kind = project.namespace['kind'] + id_namespace = project.namespace['id'] + if id_project in repos: + return False + if kind == 'group' and id_namespace in groups: + return False + return True def create_finding_gitlab(project: Project): - return { - 'name': 'GitLab Repo', - 'description': 'A GitLab repository', - 'category': 'Git Repository', - 'osi_layer': 'APPLICATION', - 'severity': 'INFORMATIONAL', - 'attributes': { - 'id': project.id, - 'web_url': project.web_url, - 'full_name': project.path_with_namespace, - 'owner_type': project.namespace['kind'], - 'owner_id': project.namespace['id'], - 'owner_name': project.namespace['name'], - 'created_at': project.created_at, - 'last_activity_at': project.last_activity_at, - 'visibility': project.visibility + return { + 'name': 'GitLab Repo', + 'description': 'A GitLab repository', + 'category': 'Git Repository', + 'osi_layer': 'APPLICATION', + 'severity': 'INFORMATIONAL', + 'attributes': { + 'id': project.id, + 'web_url': project.web_url, + 'full_name': project.path_with_namespace, + 'owner_type': project.namespace['kind'], + 'owner_id': project.namespace['id'], + 'owner_name': project.namespace['name'], + 'created_at': project.created_at, + 'last_activity_at': project.last_activity_at, + 'visibility': project.visibility + } } - } def create_finding_github(repo: Repository): - return { - 'name': 'GitHub Repo', - 'description': 'A GitHub repository', - 'category': 'Git Repository', - 'osi_layer': 'APPLICATION', - 'severity': 'INFORMATIONAL', - 'attributes': { - 'id': repo.id, - 'web_url': repo.html_url, - 'full_name': repo.full_name, - 
'owner_type': repo.owner.type, - 'owner_id': repo.owner.id, - 'owner_name': repo.owner.name, - 'created_at': repo.created_at.strftime("%Y-%m-%dT%H:%M:%SZ"), - 'last_activity_at': repo.updated_at.strftime("%Y-%m-%dT%H:%M:%SZ"), - 'visibility': 'private' if repo.private else 'public' + return { + 'name': 'GitHub Repo', + 'description': 'A GitHub repository', + 'category': 'Git Repository', + 'osi_layer': 'APPLICATION', + 'severity': 'INFORMATIONAL', + 'attributes': { + 'id': repo.id, + 'web_url': repo.html_url, + 'full_name': repo.full_name, + 'owner_type': repo.owner.type, + 'owner_id': repo.owner.id, + 'owner_name': repo.owner.name, + 'created_at': repo.created_at.strftime("%Y-%m-%dT%H:%M:%SZ"), + 'last_activity_at': repo.updated_at.strftime("%Y-%m-%dT%H:%M:%SZ"), + 'visibility': 'private' if repo.private else 'public' + } } - } if __name__ == '__main__': - main() + main() diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner/__init__.py b/scanners/git-repo-scanner/scanner/git_repo_scanner/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner/abstract_scanner.py b/scanners/git-repo-scanner/scanner/git_repo_scanner/abstract_scanner.py new file mode 100644 index 0000000000..e69de29bb2 From a789d7601a6f83664548ceb66d8ff29a2523e6a3 Mon Sep 17 00:00:00 2001 From: paulschmelzer Date: Fri, 26 Mar 2021 08:46:08 +0000 Subject: [PATCH 8/9] Updating Helm Docs --- scanners/git-repo-scanner/README.md | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/scanners/git-repo-scanner/README.md b/scanners/git-repo-scanner/README.md index f305bb2dd4..10258cc286 100644 --- a/scanners/git-repo-scanner/README.md +++ b/scanners/git-repo-scanner/README.md @@ -37,14 +37,10 @@ For type GitHub you can use the following options: - `--access-token`: Your personal GitHub access token. 
- `--ignore-repos`: A list of GitHub repository ids you want to ignore - `--obey-rate-limit`: True to obey the rate limit of the GitHub server (default), otherwise False -- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific - date expressed by a duration (now - duration). A duration string is a possibly signed sequence of decimal numbers, each - with an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units - are 'm', 'h', 'd', 'w'. -- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date - expressed by a duration (now - duration). A duration string is a possibly signed sequence of decimal numbers, each with - an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm - ', 'h', 'd', 'w'. +- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific date expressed by a duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each + with an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. +- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date expressed by a duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each with + an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. For now only organizations are supported so the option is mandatory. We **strongly recommend** providing an access token for authentication. If not provided the rate limiting will kick in after about 30 repositories scanned. @@ -53,18 +49,14 @@ for authentication. If not provided the rate limiting will kick in after about 3 For type GitLab you can use the following options: - `--url`: The url of the GitLab server. 
- `--access-token`: Your personal GitLab access token. -- `--group`: A specific GitLab group id you want to scan, including subgroups. +- `--group`: A specific GitLab group id you want to scan, including subgroups. - `--ignore-groups`: A list of GitLab group ids you want to ignore - `--ignore-repos`: A list of GitLab project ids you want to ignore - `--obey-rate-limit`: True to obey the rate limit of the GitLab server (default), otherwise False -- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific - date expressed by a duration (now - duration). A duration string is a possibly signed sequence of decimal numbers, each - with an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units - are 'm', 'h', 'd', 'w'. -- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date - expressed by a duration (now - duration). A duration string is a possibly signed sequence of decimal numbers, each with - an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm - ', 'h', 'd', 'w'. +- `--activity-since-duration`: Return git repo findings with repo activity (e.g. commits) more recent than a specific date expressed by a duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each + with an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. +- `--activity-until-duration`: Return git repo findings with repo activity (e.g. commits) older than a specific date expressed by a duration (now + duration). A duration string is a possibly signed sequence of decimal numbers, each with + an optional fraction and a unit suffix, such as '1h' or '2h45m'. Valid time units are 'm', 'h', 'd', 'w'. For gitlab the url and the access token is mandatory. 
If you don't provide a specific group id all projects on the gitlab server are going to be discovered. From 2e927cc255b8cb2220b6dcf0c16e2650c1586b57 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 26 Mar 2021 13:33:47 +0100 Subject: [PATCH 9/9] revert new structure --- scanners/git-repo-scanner/scanner/git_repo_scanner/__init__.py | 0 .../git-repo-scanner/scanner/git_repo_scanner/abstract_scanner.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 scanners/git-repo-scanner/scanner/git_repo_scanner/__init__.py delete mode 100644 scanners/git-repo-scanner/scanner/git_repo_scanner/abstract_scanner.py diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner/__init__.py b/scanners/git-repo-scanner/scanner/git_repo_scanner/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/scanners/git-repo-scanner/scanner/git_repo_scanner/abstract_scanner.py b/scanners/git-repo-scanner/scanner/git_repo_scanner/abstract_scanner.py deleted file mode 100644 index e69de29bb2..0000000000