From 7921b5fab980391c367bcc878d873b1961a7af27 Mon Sep 17 00:00:00 2001 From: Andrey Kabanov Date: Sun, 13 Feb 2022 18:46:41 -0800 Subject: [PATCH 01/30] Re-submitting PR #140 (Add additional base test suite tests) (#145) * add mac to gitignore * change testing repo * adjust `start_date_2` for new repo * add tap-tester automatic fields * add tap-tester all fields * add all expected streams to all fields test * set specific bookmark for test-repo * fix `collaborators` stream bookmark spelling for tap-tester * add more streams to automatic fields test * add tap-tester bookmarks * updates to automatic fields test: * add check for unique primary keys in replicated records * replace explicit set of expected streams with `expected_check_streams` from base * update `test_run` doc string * omit `team_memberships` stream from `expected_check_streams` * build expected_check_streams() using expected_streams() * add bug id and description * pylint fixes: * adjust imports * use specific error class * set encoding * adjust circle config: * use latest image * trim pylint disable options * add unit tests step * make sure integration tests run always * Re-submitting PR #141 (All repos for an organization) (#146) * add parsing of "org/*" wildcard to retrieve all repos for an org * add unit test cases for extract_repos_from_config() * add requests-mock for dev requirements * add unit test for get_all_repos() * Re-submitting PR #142 (Improve Rate Limiting and Retry Logic) (#143) * add basic backoff retry * add backoff to setup.py * replace deprecated assertEquals method with assertEqual * add MAX_SLEEP_SECONDS parsing from config and DEFAULT_MAX_SECONDS for rate limiting * add comments for changes and use DEFAULT_SLEEP_SECONDS * add pylint ignore for global-statement * README updates: (#148) - add full list of replicated streams - update GitHub docs links * add streams to excluded_streams that aren't respecting automatic fields * add NotFoundException handling to collaborators stream * 
add bug info * adjust test for test-repo * pylint fixes * run unit and integration steps always * don't raise a NotFoundException to deal with access issues to resources * pylint fix and return empty response body for 404 * add collaborators stream to excluded set * fixes to tap-tester tests * adjust 2nd sync dates for data * deal with None from get method * adjust start date tap-tester dates * actually deal with NoneType in get method * FIX: sub_streams sync functions passed parent metadata * remove expected_check_streams after bug identified and addressed * use expected_streams after bug identified and addressed * don't write a bookmark for FULL_TABLE streams * update unit test expectations to recent changes * updates to bookmarks tap-tester: * adjust test expectations for streams * create simulated_states based on test data and tap behavior * adjust tests based on commits and pr_commits schema * update base based on tap behavior and test data * adjust test expectations for team_members stream * Exclude collaborators stream due to access issues in circle * update circle config to include slack orb and tap-tester-user context * add bug info * add tap-tester-user to daily build context --- .circleci/config.yml | 31 +-- .gitignore | 5 + README.md | 32 ++- setup.py | 4 +- tap_github/__init__.py | 156 +++++++++---- tests/base.py | 34 +-- tests/test_github_all_fields.py | 90 ++++++++ tests/test_github_automatic_fields.py | 73 ++++++ tests/test_github_bookmarks.py | 207 ++++++++++++++++++ tests/test_github_start_date.py | 14 +- tests/unittests/test_exception_handling.py | 24 +- .../test_extract_repos_from_config.py | 32 +++ tests/unittests/test_formatting_dates.py | 4 +- tests/unittests/test_get_all_repos.py | 74 +++++++ tests/unittests/test_key_error.py | 36 +-- tests/unittests/test_rate_limit.py | 2 +- tests/unittests/test_start_date_bookmark.py | 8 +- tests/unittests/test_sub_streams_selection.py | 8 +- tests/unittests/test_verify_access.py | 20 +- 19 files changed, 
699 insertions(+), 155 deletions(-) create mode 100644 tests/test_github_all_fields.py create mode 100644 tests/test_github_automatic_fields.py create mode 100644 tests/test_github_bookmarks.py create mode 100644 tests/unittests/test_extract_repos_from_config.py create mode 100644 tests/unittests/test_get_all_repos.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 78712556..a6969bf5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,8 +1,11 @@ -version: 2 +version: 2.1 +orbs: + slack: circleci/slack@3.4.2 + jobs: build: docker: - - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:tap-tester-v4 + - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester steps: - checkout - run: @@ -21,7 +24,7 @@ jobs: name: 'pylint' command: | source /usr/local/share/virtualenvs/tap-github/bin/activate - pylint tap_github --disable 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-class-docstring,missing-function-docstring,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,wrong-spelling-in-comment,wrong-spelling-in-docstring,bad-whitespace' + pylint tap_github --disable 'missing-module-docstring,missing-function-docstring,missing-class-docstring,line-too-long,invalid-name,too-many-lines,consider-using-f-string,too-many-arguments,too-many-locals' - run: name: 'Unit Tests' command: | @@ -29,6 +32,7 @@ jobs: pip install nose coverage nosetests --with-coverage --cover-erase --cover-package=tap_github --cover-html-dir=htmlcov tests/unittests coverage html + when: always - store_test_results: path: test_output/report.xml - store_artifacts: @@ -39,20 +43,19 @@ jobs: aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh source dev_env.sh source /usr/local/share/virtualenvs/tap-tester/bin/activate - run-test 
--tap=tap-github \ - --target=target-stitch \ - --orchestrator=stitch-orchestrator \ - --email=harrison+sandboxtest@stitchdata.com \ - --password=$SANDBOX_PASSWORD \ - --client-id=50 \ - --token=$STITCH_API_TOKEN \ - tests + run-test --tap=tap-github tests + when: always + - slack/notify-on-failure: + only_for_branches: master + workflows: version: 2 commit: jobs: - build: - context: circleci-user + context: + - circleci-user + - tap-tester-user build_daily: triggers: - schedule: @@ -63,4 +66,6 @@ workflows: - master jobs: - build: - context: circleci-user + context: + - circleci-user + - tap-tester-user diff --git a/.gitignore b/.gitignore index 59ed95a6..57a09a59 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,8 @@ properties.json # Jetbrains IDE .idea + +# macOS +*.DS_Store +.AppleDouble +.LSOverride \ No newline at end of file diff --git a/README.md b/README.md index 12454261..3e956789 100644 --- a/README.md +++ b/README.md @@ -2,20 +2,32 @@ This is a [Singer](https://singer.io) tap that produces JSON-formatted data from the GitHub API following the [Singer -spec](https://github.com/singer-io/getting-started/blob/master/SPEC.md). +spec](https://github.com/singer-io/getting-started/blob/master/docs/SPEC.md). 
This tap: - Pulls raw data from the [GitHub REST API](https://developer.github.com/v3/) - Extracts the following resources from GitHub for a single repository: - [Assignees](https://developer.github.com/v3/issues/assignees/#list-assignees) - [Collaborators](https://developer.github.com/v3/repos/collaborators/#list-collaborators) - [Commits](https://developer.github.com/v3/repos/commits/#list-commits-on-a-repository) - [Issues](https://developer.github.com/v3/issues/#list-issues-for-a-repository) - [Pull Requests](https://developer.github.com/v3/pulls/#list-pull-requests) - [Comments](https://developer.github.com/v3/issues/comments/#list-comments-in-a-repository) - [Reviews](https://developer.github.com/v3/pulls/reviews/#list-reviews-on-a-pull-request) - [Review Comments](https://developer.github.com/v3/pulls/comments/) - [Stargazers](https://developer.github.com/v3/activity/starring/#list-stargazers) + - [Assignees](https://docs.github.com/en/rest/reference/issues#list-assignees) + - [Collaborators](https://docs.github.com/en/rest/reference/repos#list-repository-collaborators) + - [Commits](https://docs.github.com/en/rest/reference/repos#list-commits) + - [Commit Comments](https://docs.github.com/en/rest/reference/repos#list-commit-comments-for-a-repository) + - [Events](https://docs.github.com/en/rest/reference/issues#events) + - [Issues](https://docs.github.com/en/rest/reference/issues#list-repository-issues) + - [Issue Events](https://docs.github.com/en/rest/reference/issues#list-issue-events-for-a-repository) + - [Issue Milestones](https://docs.github.com/en/rest/reference/issues#list-milestones) + - [Projects](https://docs.github.com/en/rest/reference/projects#list-repository-projects) + - [Project Cards](https://docs.github.com/en/rest/reference/projects#list-project-cards) + - [Project Columns](https://docs.github.com/en/rest/reference/projects#list-project-columns) + - [Pull 
Requests](https://docs.github.com/en/rest/reference/pulls#list-pull-requests) + - [PR Commits](https://docs.github.com/en/rest/reference/pulls#list-commits-on-a-pull-request) + - [Releases](https://docs.github.com/en/rest/reference/repos#list-releases) + - [Comments](https://docs.github.com/en/rest/reference/issues#list-issue-comments-for-a-repository) + - [Reviews](https://docs.github.com/en/rest/reference/pulls#list-reviews-for-a-pull-request) + - [Review Comments](https://docs.github.com/en/rest/reference/pulls#list-review-comments-in-a-repository) + - [Stargazers](https://docs.github.com/en/rest/reference/activity#list-stargazers) + - [Teams](https://docs.github.com/en/rest/reference/teams#list-teams) + - [Team Members](https://docs.github.com/en/rest/reference/teams#list-team-members) + - [Team Memberships](https://docs.github.com/en/rest/reference/teams#get-team-membership-for-a-user) - Outputs the schema for each resource - Incrementally pulls data based on the input state diff --git a/setup.py b/setup.py index e37afe7e..4f8a4836 100644 --- a/setup.py +++ b/setup.py @@ -11,13 +11,15 @@ py_modules=['tap_github'], install_requires=[ 'singer-python==5.12.1', - 'requests==2.20.0' + 'requests==2.20.0', + 'backoff==1.8.0' ], extras_require={ 'dev': [ 'pylint==2.6.2', 'ipdb', 'nose', + 'requests-mock==1.9.3' ] }, entry_points=''' diff --git a/tap_github/__init__.py b/tap_github/__init__.py index 3d4536c8..1f4eebcc 100644 --- a/tap_github/__init__.py +++ b/tap_github/__init__.py @@ -1,15 +1,13 @@ -import argparse import os import json import collections import time import requests -import singer -import singer.bookmarks as bookmarks -import singer.metrics as metrics import backoff +import singer -from singer import metadata +from singer import (bookmarks, metrics, metadata) +from simplejson import JSONDecodeError session = requests.Session() logger = singer.get_logger() @@ -45,6 +43,9 @@ 'team_memberships': ['url'] } +DEFAULT_SLEEP_SECONDS = 600 +MAX_SLEEP_SECONDS = 
DEFAULT_SLEEP_SECONDS + class GithubException(Exception): pass @@ -101,7 +102,7 @@ class RateLimitExceeded(GithubException): }, 404: { "raise_exception": NotFoundException, - "message": "The resource you have specified cannot be found" + "message": "The resource you have specified cannot be found. Alternatively the access_token is not valid for the resource" }, 409: { "raise_exception": ConflictError, @@ -172,7 +173,7 @@ def raise_for_error(resp, source): error_code = resp.status_code try: response_json = resp.json() - except Exception: + except JSONDecodeError: response_json = {} if error_code == 404: @@ -180,9 +181,12 @@ def raise_for_error(resp, source): if source == "teams": details += ' or it is a personal account repository' message = "HTTP-error-code: 404, Error: {}. Please refer \'{}\' for more details.".format(details, response_json.get("documentation_url")) - else: - message = "HTTP-error-code: {}, Error: {}".format( - error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) + logger.info(message) + # don't raise a NotFoundException + return None + + message = "HTTP-error-code: {}, Error: {}".format( + error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) exc = ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("raise_exception", GithubException) raise exc(message) from None @@ -195,7 +199,7 @@ def rate_throttling(response): if int(response.headers['X-RateLimit-Remaining']) == 0: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) - if seconds_to_sleep > 600: + if seconds_to_sleep > MAX_SLEEP_SECONDS: message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) raise RateLimitExceeded(message) from None @@ -214,6 +218,9 @@ def authed_get(source, url, headers={}): raise_for_error(resp, source) timer.tags[metrics.Tag.http_status_code] = 
resp.status_code rate_throttling(resp) + if resp.status_code == 404: + # return an empty response body since we're not raising a NotFoundException + resp._content = b'{}' # pylint: disable=protected-access return resp def authed_get_all_pages(source, url, headers={}): @@ -249,7 +256,7 @@ def load_schemas(): for filename in os.listdir(get_abs_path('schemas')): path = get_abs_path('schemas') + '/' + filename file_raw = filename.replace('.json', '') - with open(path) as file: + with open(path, encoding='utf-8') as file: schemas[file_raw] = json.load(file) schemas['pr_commits'] = generate_pr_commit_schema(schemas['commits']) @@ -315,6 +322,57 @@ def get_catalog(): return {'streams': streams} +def get_all_repos(organizations: list) -> list: + """ + Retrieves all repositories for the provided organizations and + verifies basic access for them. + + Docs: https://docs.github.com/en/rest/reference/repos#list-organization-repositories + """ + repos = [] + + for org_path in organizations: + org = org_path.split('/')[0] + for response in authed_get_all_pages( + 'get_all_repos', + 'https://api.github.com/orgs/{}/repos?sort=created&direction=desc'.format(org) + ): + org_repos = response.json() + + for repo in org_repos: + repo_full_name = repo.get('full_name') + + logger.info("Verifying access of repository: %s", repo_full_name) + verify_repo_access( + 'https://api.github.com/repos/{}/commits'.format(repo_full_name), + repo + ) + + repos.append(repo_full_name) + + return repos + +def extract_repos_from_config(config: dict ) -> list: + """ + Extracts all repositories from the config and calls get_all_repos() + for organizations using the wildcard 'org/*' format. 
+ """ + repo_paths = list(filter(None, config['repository'].split(' '))) + + orgs_with_all_repos = list(filter(lambda x: x.split('/')[1] == '*', repo_paths)) + + if orgs_with_all_repos: + # remove any wildcard "org/*" occurrences from `repo_paths` + repo_paths = list(set(repo_paths).difference(set(orgs_with_all_repos))) + + # get all repositores for an org in the config + all_repos = get_all_repos(orgs_with_all_repos) + + # update repo_paths + repo_paths.extend(all_repos) + + return repo_paths + def verify_repo_access(url_for_repo, repo): try: authed_get("verifying repository access", url_for_repo) @@ -328,7 +386,7 @@ def verify_access_for_repo(config): access_token = config['access_token'] session.headers.update({'authorization': 'token ' + access_token, 'per_page': '1', 'page': '1'}) - repositories = list(filter(None, config['repository'].split(' '))) + repositories = extract_repos_from_config(config) for repo in repositories: logger.info("Verifying access of repository: %s", repo) @@ -360,18 +418,16 @@ def get_all_teams(schemas, repo_path, state, mdata, _start_date): # transform and write release record with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) + rec = transformer.transform(r, schemas['teams'], metadata=metadata.to_map(mdata['teams'])) singer.write_record('teams', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'teams', {'since': singer.utils.strftime(extraction_time)}) counter.increment() if schemas.get('team_members'): - for team_members_rec in get_all_team_members(team_slug, schemas['team_members'], repo_path, state, mdata): + for team_members_rec in get_all_team_members(team_slug, schemas['team_members'], repo_path, state, mdata['team_members']): singer.write_record('team_members', team_members_rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'team_members', {'since': singer.utils.strftime(extraction_time)}) if 
schemas.get('team_memberships'): - for team_memberships_rec in get_all_team_memberships(team_slug, schemas['team_memberships'], repo_path, state, mdata): + for team_memberships_rec in get_all_team_memberships(team_slug, schemas['team_memberships'], repo_path, state, mdata['team_memberships']): singer.write_record('team_memberships', team_memberships_rec, time_extracted=extraction_time) return state @@ -547,7 +603,6 @@ def get_all_issue_labels(schemas, repo_path, state, mdata, _start_date): with singer.Transformer() as transformer: rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) singer.write_record('issue_labels', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'issue_labels', {'since': singer.utils.strftime(extraction_time)}) counter.increment() return state @@ -616,7 +671,7 @@ def get_all_projects(schemas, repo_path, state, mdata, start_date): # transform and write release record with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) + rec = transformer.transform(r, schemas['projects'], metadata=metadata.to_map(mdata['projects'])) singer.write_record('projects', rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'projects', {'since': singer.utils.strftime(extraction_time)}) counter.increment() @@ -627,14 +682,14 @@ def get_all_projects(schemas, repo_path, state, mdata, start_date): # sync project_columns if that schema is present (only there if selected) if schemas.get('project_columns'): - for project_column_rec in get_all_project_columns(project_id, schemas['project_columns'], repo_path, state, mdata, start_date): + for project_column_rec in get_all_project_columns(project_id, schemas['project_columns'], repo_path, state, mdata['project_columns'], start_date): singer.write_record('project_columns', project_column_rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'project_columns', {'since': 
singer.utils.strftime(extraction_time)}) # sync project_cards if that schema is present (only there if selected) if schemas.get('project_cards'): column_id = project_column_rec['id'] - for project_card_rec in get_all_project_cards(column_id, schemas['project_cards'], repo_path, state, mdata, start_date): + for project_card_rec in get_all_project_cards(column_id, schemas['project_cards'], repo_path, state, mdata['project_cards'], start_date): singer.write_record('project_cards', project_card_rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'project_cards', {'since': singer.utils.strftime(extraction_time)}) return state @@ -721,7 +776,6 @@ def get_all_releases(schemas, repo_path, state, mdata, _start_date): with singer.Transformer() as transformer: rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) singer.write_record('releases', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'releases', {'since': singer.utils.strftime(extraction_time)}) counter.increment() return state @@ -761,14 +815,14 @@ def get_all_pull_requests(schemas, repo_path, state, mdata, start_date): # transform and write pull_request record with singer.Transformer() as transformer: - rec = transformer.transform(pr, schemas['pull_requests'], metadata=metadata.to_map(mdata)) + rec = transformer.transform(pr, schemas['pull_requests'], metadata=metadata.to_map(mdata['pull_requests'])) singer.write_record('pull_requests', rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'pull_requests', {'since': singer.utils.strftime(extraction_time)}) counter.increment() # sync reviews if that schema is present (only there if selected) if schemas.get('reviews'): - for review_rec in get_reviews_for_pr(pr_num, schemas['reviews'], repo_path, state, mdata): + for review_rec in get_reviews_for_pr(pr_num, schemas['reviews'], repo_path, state, mdata['reviews']): singer.write_record('reviews', review_rec, 
time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'reviews', {'since': singer.utils.strftime(extraction_time)}) @@ -776,7 +830,7 @@ def get_all_pull_requests(schemas, repo_path, state, mdata, start_date): # sync review comments if that schema is present (only there if selected) if schemas.get('review_comments'): - for review_comment_rec in get_review_comments_for_pr(pr_num, schemas['review_comments'], repo_path, state, mdata): + for review_comment_rec in get_review_comments_for_pr(pr_num, schemas['review_comments'], repo_path, state, mdata['review_comments']): singer.write_record('review_comments', review_comment_rec, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'review_comments', {'since': singer.utils.strftime(extraction_time)}) @@ -787,7 +841,7 @@ def get_all_pull_requests(schemas, repo_path, state, mdata, start_date): schemas['pr_commits'], repo_path, state, - mdata + mdata['pr_commits'] ): singer.write_record('pr_commits', pr_commit, time_extracted=extraction_time) singer.write_bookmark(state, repo_path, 'pr_commits', {'since': singer.utils.strftime(extraction_time)}) @@ -859,7 +913,6 @@ def get_all_assignees(schema, repo_path, state, mdata, _start_date): with singer.Transformer() as transformer: rec = transformer.transform(assignee, schema, metadata=metadata.to_map(mdata)) singer.write_record('assignees', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'assignees', {'since': singer.utils.strftime(extraction_time)}) counter.increment() return state @@ -869,19 +922,26 @@ def get_all_collaborators(schema, repo_path, state, mdata, _start_date): https://developer.github.com/v3/repos/collaborators/#list-collaborators ''' with metrics.record_counter('collaborators') as counter: - for response in authed_get_all_pages( - 'collaborators', - 'https://api.github.com/repos/{}/collaborators'.format(repo_path) - ): - collaborators = response.json() - extraction_time = singer.utils.now() - for 
collaborator in collaborators: - collaborator['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(collaborator, schema, metadata=metadata.to_map(mdata)) - singer.write_record('collaborators', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'collaborator', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() + try: + responses = authed_get_all_pages( + 'collaborators', + 'https://api.github.com/repos/{}/collaborators'.format(repo_path) + ) + except NotFoundException as error: + logger.info( + 'Unable to retreive collaborators stream, check access_token is valid for %s. See full error message: %s', + repo_path, error + ) + else: + for response in responses: + collaborators = response.json() + extraction_time = singer.utils.now() + for collaborator in collaborators: + collaborator['_sdc_repository'] = repo_path + with singer.Transformer() as transformer: + rec = transformer.transform(collaborator, schema, metadata=metadata.to_map(mdata)) + singer.write_record('collaborators', rec, time_extracted=extraction_time) + counter.increment() return state @@ -987,7 +1047,6 @@ def get_all_stargazers(schema, repo_path, state, mdata, _start_date): rec = transformer.transform(stargazer, schema, metadata=metadata.to_map(mdata)) rec['user_id'] = user_id singer.write_record('stargazers', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'stargazers', {'since': singer.utils.strftime(extraction_time)}) counter.increment() return state @@ -1064,7 +1123,7 @@ def do_sync(config, state, catalog): selected_stream_ids = get_selected_streams(catalog) validate_dependencies(selected_stream_ids) - repositories = list(filter(None, config['repository'].split(' '))) + repositories = extract_repos_from_config(config) state = translate_state(state, catalog, repositories) singer.write_state(state) @@ -1096,17 +1155,19 @@ def do_sync(config, state, catalog): # handle 
streams with sub streams else: stream_schemas = {stream_id: stream_schema} + stream_mdata = {stream_id: mdata} # get and write selected sub stream schemas for sub_stream_id in sub_stream_ids: if sub_stream_id in selected_stream_ids: sub_stream = get_stream_from_catalog(sub_stream_id, catalog) stream_schemas[sub_stream_id] = sub_stream['schema'] + stream_mdata[sub_stream_id] = sub_stream['metadata'] singer.write_schema(sub_stream_id, sub_stream['schema'], sub_stream['key_properties']) # sync stream and it's sub streams - state = sync_func(stream_schemas, repo, state, mdata, start_date) + state = sync_func(stream_schemas, repo, state, stream_mdata, start_date) singer.write_state(state) @@ -1114,6 +1175,13 @@ def do_sync(config, state, catalog): def main(): args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS) + # get optional config key `max_sleep_seconds` + config_max_sleep = args.config.get('max_sleep_seconds') + + # set global `MAX_SLEEP_SECONDS` for rate_throttling function or use default + global MAX_SLEEP_SECONDS #pylint: disable=global-statement + MAX_SLEEP_SECONDS = config_max_sleep if config_max_sleep else DEFAULT_SLEEP_SECONDS + if args.discover: do_discover(args.config) else: diff --git a/tests/base.py b/tests/base.py index 7b204e64..6ea415a4 100644 --- a/tests/base.py +++ b/tests/base.py @@ -46,8 +46,8 @@ def get_properties(self, original: bool = True): :param original: set to false to change the start_date or end_date """ return_value = { - 'start_date' : dt.strftime(dt.utcnow()-timedelta(days=5), self.START_DATE_FORMAT), - 'repository': 'singer-io/tap-github' + 'start_date' : '2021-10-01T00:00:00Z', + 'repository': 'singer-io/test-repo' } if original: return return_value @@ -61,32 +61,6 @@ def get_credentials(self): 'access_token': os.getenv("TAP_GITHUB_TOKEN") } - @staticmethod - def expected_check_streams(): - return { - 'assignees', - 'collaborators', - 'comments', - 'commit_comments', - 'commits', - 'events', - 'issue_labels', - 'issue_milestones', - 
'issue_events', - 'issues', - 'pr_commits', - 'project_cards', - 'project_columns', - 'projects', - 'pull_requests', - 'releases', - 'review_comments', - 'reviews', - 'stargazers', - 'team_members', - 'team_memberships', - 'teams' - } def expected_metadata(self): """The expected streams and metadata about the streams""" @@ -134,7 +108,7 @@ def expected_metadata(self): "issue_milestones": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, - self.BOOKMARK: {"due_on"}, + self.BOOKMARK: {"updated_at"}, self.OBEYS_START_DATE: True }, "issue_events": { @@ -193,7 +167,7 @@ def expected_metadata(self): "reviews": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, - self.BOOKMARK: {"updated_at"}, + self.BOOKMARK: {"submitted_at"}, self.OBEYS_START_DATE: True }, "stargazers": { diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py new file mode 100644 index 00000000..17173dc1 --- /dev/null +++ b/tests/test_github_all_fields.py @@ -0,0 +1,90 @@ +import os + +from tap_tester import runner, connections, menagerie + +from base import TestGithubBase + + +class TestGithubAllFields(TestGithubBase): + """Test that with all fields selected for a stream automatic and available fields are replicated""" + + @staticmethod + def name(): + return "tap_tester_github_all_fields" + + def test_run(self): + """ + Ensure running the tap with all streams and fields selected results in the + replication of all fields. + - Verify no unexpected streams were replicated + - Verify that more than just the automatic fields are replicated for each stream. 
+ """ + # BUG TDL-16672 + # The excluded streams are not honoring all fields selection + excluded_streams = { + 'issue_events', + 'comments', + 'projects', + 'pr_commits', + 'events', + 'review_comments', + 'issues', + 'project_cards', + 'project_columns', + 'commits', + 'collaborators' + } + + expected_streams = self.expected_streams() - excluded_streams + + # instantiate connection + conn_id = connections.ensure_connection(self) + + # run check mode + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # table and field selection + test_catalogs_all_fields = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in expected_streams] + self.perform_and_verify_table_and_field_selection( + conn_id, test_catalogs_all_fields, select_all_fields=True, + ) + + # grab metadata after performing table-and-field selection to set expectations + stream_to_all_catalog_fields = dict() # used for asserting all fields are replicated + for catalog in test_catalogs_all_fields: + stream_id, stream_name = catalog['stream_id'], catalog['stream_name'] + catalog_entry = menagerie.get_annotated_schema(conn_id, stream_id) + fields_from_field_level_md = [md_entry['breadcrumb'][1] + for md_entry in catalog_entry['metadata'] + if md_entry['breadcrumb'] != []] + stream_to_all_catalog_fields[stream_name] = set(fields_from_field_level_md) + + # run initial sync + record_count_by_stream = self.run_and_verify_sync(conn_id) + synced_records = runner.get_records_from_target_output() + + # Verify no unexpected streams were replicated + synced_stream_names = set(synced_records.keys()) + self.assertSetEqual(expected_streams, synced_stream_names) + + for stream in expected_streams: + with self.subTest(stream=stream): + # expected values + expected_automatic_keys = self.expected_primary_keys().get(stream) + + # get all expected keys + expected_all_keys = stream_to_all_catalog_fields[stream] + + # collect actual values + messages = synced_records.get(stream) + actual_all_keys = 
[set(message['data'].keys()) for message in messages['messages'] + if message['action'] == 'upsert'][0] + + # Verify that you get some records for each stream + self.assertGreater(record_count_by_stream.get(stream, -1), 0) + + # verify all fields for a stream were replicated + self.assertGreater(len(expected_all_keys), len(expected_automatic_keys)) + self.assertTrue(expected_automatic_keys.issubset(expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"') + self.assertSetEqual(expected_all_keys, actual_all_keys) diff --git a/tests/test_github_automatic_fields.py b/tests/test_github_automatic_fields.py new file mode 100644 index 00000000..03ae904f --- /dev/null +++ b/tests/test_github_automatic_fields.py @@ -0,0 +1,73 @@ +""" +Test that with no fields selected for a stream automatic fields are still replicated +""" +from tap_tester import runner, connections + +from base import TestGithubBase + + +class TestGithubAutomaticFields(TestGithubBase): + """Test that with no fields selected for a stream automatic fields are still replicated""" + + @staticmethod + def name(): + return "tap_tester_github_automatic_fields" + + def test_run(self): + """ + - Verify that for each stream you can get multiple pages of data + when no fields are selected. + - Verify that only the automatic fields are sent to the target. + - Verify that all replicated records have unique primary key values. 
+ """ + # Exclude collaborators stream due to access issues in circle + expected_streams = self.expected_streams() - {'collaborators'} + + # instantiate connection + conn_id = connections.ensure_connection(self) + + # run check mode + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # table and field selection + test_catalogs_automatic_fields = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in expected_streams] + + self.perform_and_verify_table_and_field_selection( + conn_id, test_catalogs_automatic_fields, select_all_fields=False, + ) + + # run initial sync + record_count_by_stream = self.run_and_verify_sync(conn_id) + synced_records = runner.get_records_from_target_output() + + for stream in expected_streams: + with self.subTest(stream=stream): + # expected values + expected_keys = self.expected_primary_keys().get(stream) + + # collect actual values + data = synced_records.get(stream, {}) + record_messages_keys = [set(row.get('data').keys()) for row in data.get('messages', {})] + primary_keys_list = [ + tuple(message.get('data').get(expected_pk) for expected_pk in expected_keys) + for message in data.get('messages') + if message.get('action') == 'upsert'] + unique_primary_keys_list = set(primary_keys_list) + + # Verify that you get some records for each stream + self.assertGreater( + record_count_by_stream.get(stream, -1), 0, + msg="The number of records is not over the stream max limit for the {} stream".format(stream)) + + # Verify that only the automatic fields are sent to the target + for actual_keys in record_messages_keys: + self.assertSetEqual(expected_keys, actual_keys) + + # BUG-TDL-17507 An org can have multiple teams with overlapping membership + if stream != 'team_members': + # Verify that all replicated records have unique primary key values. 
+ self.assertEqual( + len(primary_keys_list), + len(unique_primary_keys_list), + msg="Replicated record does not have unique primary key values.") diff --git a/tests/test_github_bookmarks.py b/tests/test_github_bookmarks.py new file mode 100644 index 00000000..3520a9d8 --- /dev/null +++ b/tests/test_github_bookmarks.py @@ -0,0 +1,207 @@ +import datetime +import dateutil.parser +import pytz + +from tap_tester import runner, menagerie, connections + +from base import TestGithubBase + + +class TestGithubBookmarks(TestGithubBase): + @staticmethod + def name(): + return "tap_tester_github_bookmarks" + + @staticmethod + def convert_state_to_utc(date_str): + """ + Convert a saved bookmark value of the form '2020-08-25T13:17:36-07:00' to + a string formatted utc datetime, + in order to compare against json formatted datetime values + """ + date_object = dateutil.parser.parse(date_str) + date_object_utc = date_object.astimezone(tz=pytz.UTC) + return datetime.datetime.strftime(date_object_utc, "%Y-%m-%dT%H:%M:%SZ") + + def calculated_states_by_stream(self, current_state, synced_records, replication_keys): + """ + Look at the bookmarks from a previous sync and set a new bookmark + value based off timedelta expectations. This ensures the subsequent sync will replicate + at least 1 record but, fewer records than the previous sync. + + If the test data is changed in the future this will break expectations for this test. 
+ """ + timedelta_by_stream = {stream: [90,0,0] # {stream_name: [days, hours, minutes], ...} + for stream in self.expected_streams()} + timedelta_by_stream['comments'] = [7, 0, 0] + timedelta_by_stream['commit_comments'] = [0, 0, 1] + timedelta_by_stream['commits'] = [0, 17, 0] + timedelta_by_stream['issue_events'] = [1, 0, 0] + timedelta_by_stream['issue_milestones'] = [0, 1, 0] + timedelta_by_stream['issues'] = [7, 0, 0] + timedelta_by_stream['pull_requests'] = [7, 0, 0] + + repo = self.get_properties().get('repository') + + stream_to_calculated_state = {stream: "" for stream in current_state['bookmarks'][repo].keys()} + for stream, state in current_state['bookmarks'][repo].items(): + state_key, state_value = next(iter(state.keys())), next(iter(state.values())) + sync_messages = [record.get('data') for record in + synced_records.get(stream, {'messages': []}).get('messages') + if record.get('action') == 'upsert'] + + # the `commits` and `pr_commits` streams don't have a top level replication_key field + if stream in ('commits', 'pr_commits'): + max_record_values = [values.get('commit', {}).get('committer', {}).get('date') + for values in sync_messages] + max_value = max(max_record_values) + else: + replication_key = next(iter(replication_keys.get(stream))) + max_record_values = [values.get(replication_key) for values in sync_messages] + max_value = max(max_record_values) + + # this is because the tap uses `time_extracted` to bookmark with `since` at execution + new_state_value = min(max_value, state_value) + state_as_datetime = dateutil.parser.parse(new_state_value) + + days, hours, minutes = timedelta_by_stream[stream] + calculated_state_as_datetime = state_as_datetime - datetime.timedelta(days=days, hours=hours, minutes=minutes) + + state_format = '%Y-%m-%dT%H:%M:%S-00:00' + calculated_state_formatted = datetime.datetime.strftime(calculated_state_as_datetime, state_format) + + stream_to_calculated_state[stream] = {state_key: calculated_state_formatted} + + 
return stream_to_calculated_state + + + def test_run(self): + # Exclude collaborators stream due to access issues in circle + expected_streams = self.expected_streams() - {'collaborators'} + + expected_replication_keys = self.expected_bookmark_keys() + expected_replication_methods = self.expected_replication_method() + + repo = self.get_properties().get('repository') + + ########################################################################## + ### First Sync + ########################################################################## + + conn_id = connections.ensure_connection(self, original_properties=True) + + # Run in check mode + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Select only the expected streams tables + catalog_entries = [ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams] + self.perform_and_verify_table_and_field_selection(conn_id, catalog_entries, select_all_fields=True) + + # Run a sync job using orchestrator + first_sync_record_count = self.run_and_verify_sync(conn_id) + first_sync_records = runner.get_records_from_target_output() + first_sync_bookmarks = menagerie.get_state(conn_id) + + ########################################################################## + ### Update State Between Syncs + ########################################################################## + + new_states = {'bookmarks': dict()} + simulated_states = self.calculated_states_by_stream(first_sync_bookmarks, + first_sync_records, expected_replication_keys) + for stream, new_state in simulated_states.items(): + new_states['bookmarks'][stream] = new_state + menagerie.set_state(conn_id, new_states) + + ########################################################################## + ### Second Sync + ########################################################################## + + second_sync_record_count = self.run_and_verify_sync(conn_id) + second_sync_records = runner.get_records_from_target_output() + second_sync_bookmarks = 
menagerie.get_state(conn_id) + + ########################################################################## + ### Test By Stream + ########################################################################## + + for stream in expected_streams: + with self.subTest(stream=stream): + + # expected values + expected_replication_method = expected_replication_methods[stream] + + # collect information for assertions from syncs 1 & 2 based on expected values + first_sync_count = first_sync_record_count.get(stream, 0) + second_sync_count = second_sync_record_count.get(stream, 0) + first_sync_messages = [record.get('data') for record in + first_sync_records.get(stream, {'messages': []}).get('messages') + if record.get('action') == 'upsert'] + second_sync_messages = [record.get('data') for record in + second_sync_records.get(stream, {'messages': []}).get('messages') + if record.get('action') == 'upsert'] + first_bookmark_key_value = first_sync_bookmarks.get('bookmarks', {}).get(repo, {stream: None}).get(stream) + second_bookmark_key_value = second_sync_bookmarks.get('bookmarks', {}).get(repo, {stream: None}).get(stream) + + + if expected_replication_method == self.INCREMENTAL: + # collect information specific to incremental streams from syncs 1 & 2 + replication_key = next(iter(expected_replication_keys[stream])) + first_bookmark_value = first_bookmark_key_value.get('since') + second_bookmark_value = second_bookmark_key_value.get('since') + first_bookmark_value_utc = self.convert_state_to_utc(first_bookmark_value) + second_bookmark_value_utc = self.convert_state_to_utc(second_bookmark_value) + + # Verify the first sync sets a bookmark of the expected form + self.assertIsNotNone(first_bookmark_key_value) + self.assertIsNotNone(first_bookmark_key_value.get('since')) + + # Verify the second sync sets a bookmark of the expected form + self.assertIsNotNone(second_bookmark_key_value) + self.assertIsNotNone(second_bookmark_key_value.get('since')) + + # Verify the second sync bookmark is 
Equal or Greater than the first sync bookmark + # the tap uses `time_extracted` and sets a bookmark using `since` for all real/pseudo incremental streams + self.assertGreaterEqual(second_bookmark_value, first_bookmark_value) + + for record in second_sync_messages: + # Verify the second sync bookmark value is the max replication key value for a given stream + if stream in ('commits', 'pr_commits'): + replication_key_value = record.get('commit', {}).get('committer', {}).get('date') + else: + replication_key_value = record.get(replication_key) + self.assertLessEqual( + replication_key_value, second_bookmark_value_utc, + msg="Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." + ) + + for record in first_sync_messages: + # Verify the first sync bookmark value is the max replication key value for a given stream + if stream in ('commits', 'pr_commits'): + replication_key_value = record.get('commit', {}).get('committer', {}).get('date') + else: + replication_key_value = record.get(replication_key) + self.assertLessEqual( + replication_key_value, first_bookmark_value_utc, + msg="First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." 
+ ) + + # Verify the number of records in the 2nd sync is less than or equal to the first + self.assertLessEqual(second_sync_count, first_sync_count) + + + elif expected_replication_method == self.FULL: + # Verify the syncs do not set a bookmark for full table streams + self.assertIsNone(first_bookmark_key_value) + self.assertIsNone(second_bookmark_key_value) + + # Verify the number of records in the second sync is the same as the first + self.assertEqual(second_sync_count, first_sync_count) + + else: + raise NotImplementedError( + "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}".format(stream, expected_replication_method) + ) + + # Verify at least 1 record was replicated in the second sync + self.assertGreater(second_sync_count, 0, msg="We are not fully testing bookmarking for {}".format(stream)) diff --git a/tests/test_github_start_date.py b/tests/test_github_start_date.py index e48ab584..34065255 100644 --- a/tests/test_github_start_date.py +++ b/tests/test_github_start_date.py @@ -18,7 +18,7 @@ def name(): def generate_data(self): # get the token token = os.getenv("TAP_GITHUB_TOKEN") - url = "https://api.github.com/user/starred/singer-io/tap-github" + url = "https://api.github.com/user/starred/singer-io/test-repo" headers = {"Authorization": "Bearer {}".format(token)} # generate a data for 'events' stream: 'watchEvent' ie. 
star the repo @@ -33,13 +33,13 @@ def test_run(self): # run the test for all the streams excluding 'events' stream # as for 'events' stream we have to use dynamic dates - self.run_test('2020-04-01T00:00:00Z', '2021-06-10T00:00:00Z', self.expected_streams() - {'events'}) + self.run_test('2020-04-01T00:00:00Z', '2021-10-08T00:00:00Z', self.expected_streams() - {'events'}) # As per the Documentation: https://docs.github.com/en/rest/reference/activity#events # the 'events' of past 90 days will only be returned # if there are no events in past 90 days, then there will be '304 Not Modified' error today = datetime.today() - date_1 = datetime.strftime(today - timedelta(days=4), "%Y-%m-%dT00:00:00Z") + date_1 = datetime.strftime(today - timedelta(days=90), "%Y-%m-%dT00:00:00Z") date_2 = datetime.strftime(today - timedelta(days=1), "%Y-%m-%dT00:00:00Z") # run the test for 'events' stream self.run_test(date_1, date_2, {'events'}) @@ -126,11 +126,11 @@ def run_test(self, date_1, date_2, streams): # collect information for assertions from syncs 1 & 2 base on expected values record_count_sync_1 = record_count_by_stream_1.get(stream, 0) record_count_sync_2 = record_count_by_stream_2.get(stream, 0) - primary_keys_list_1 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) - for message in synced_records_1.get(stream).get('messages') + primary_keys_list_1 = [tuple(message.get('data', {}).get(expected_pk) for expected_pk in expected_primary_keys) + for message in synced_records_1.get(stream, {'messages': []}).get('messages') if message.get('action') == 'upsert'] - primary_keys_list_2 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) - for message in synced_records_2.get(stream).get('messages') + primary_keys_list_2 = [tuple(message.get('data', {}).get(expected_pk) for expected_pk in expected_primary_keys) + for message in synced_records_2.get(stream, {'messages': []}).get('messages') if message.get('action') == 
'upsert'] primary_keys_sync_1 = set(primary_keys_list_1) diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index 8036e093..e2c86120 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -32,7 +32,7 @@ def test_zero_content_length(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.BadRequestException as e: - self.assertEquals(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") + self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") def test_400_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(400, raise_error = True) @@ -40,7 +40,7 @@ def test_400_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.BadRequestException as e: - self.assertEquals(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") + self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") def test_401_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(401, raise_error = True) @@ -48,7 +48,7 @@ def test_401_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.BadCredentialsException as e: - self.assertEquals(str(e), "HTTP-error-code: 401, Error: Invalid authorization credentials.") + self.assertEqual(str(e), "HTTP-error-code: 401, Error: Invalid authorization credentials.") def test_403_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(403, raise_error = True) @@ -56,7 +56,7 @@ def test_403_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.AuthException as e: - self.assertEquals(str(e), "HTTP-error-code: 403, Error: User 
doesn't have permission to access the resource.") + self.assertEqual(str(e), "HTTP-error-code: 403, Error: User doesn't have permission to access the resource.") def test_404_error(self, mocked_parse_args, mocked_request): json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} @@ -65,7 +65,7 @@ def test_404_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.NotFoundException as e: - self.assertEquals(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Please refer '{}' for more details.".format(json.get("documentation_url"))) + self.assertEqual(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Please refer '{}' for more details.".format(json.get("documentation_url"))) def test_404_error_for_teams(self, mocked_parse_args, mocked_request): json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} @@ -73,7 +73,7 @@ def test_404_error_for_teams(self, mocked_parse_args, mocked_request): try: tap_github.raise_for_error(get_response(404, json = json, raise_error = True), "teams") except tap_github.NotFoundException as e: - self.assertEquals(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found or it is a personal account repository. Please refer '{}' for more details.".format(json.get("documentation_url"))) + self.assertEqual(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found or it is a personal account repository. 
Please refer '{}' for more details.".format(json.get("documentation_url"))) def test_500_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(500, raise_error = True) @@ -81,7 +81,7 @@ def test_500_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.InternalServerError as e: - self.assertEquals(str(e), "HTTP-error-code: 500, Error: An error has occurred at Github's end.") + self.assertEqual(str(e), "HTTP-error-code: 500, Error: An error has occurred at Github's end.") def test_301_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(301, raise_error = True) @@ -89,7 +89,7 @@ def test_301_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.MovedPermanentlyError as e: - self.assertEquals(str(e), "HTTP-error-code: 301, Error: The resource you are looking for is moved to another URL.") + self.assertEqual(str(e), "HTTP-error-code: 301, Error: The resource you are looking for is moved to another URL.") def test_304_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(304, raise_error = True) @@ -97,7 +97,7 @@ def test_304_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.NotModifiedError as e: - self.assertEquals(str(e), "HTTP-error-code: 304, Error: The requested resource has not been modified since the last time you accessed it.") + self.assertEqual(str(e), "HTTP-error-code: 304, Error: The requested resource has not been modified since the last time you accessed it.") def test_422_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(422, raise_error = True) @@ -105,7 +105,7 @@ def test_422_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.UnprocessableError as e: - self.assertEquals(str(e), "HTTP-error-code: 422, Error: 
The request was not able to process right now.") + self.assertEqual(str(e), "HTTP-error-code: 422, Error: The request was not able to process right now.") def test_409_error(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(409, raise_error = True) @@ -113,11 +113,11 @@ def test_409_error(self, mocked_parse_args, mocked_request): try: tap_github.authed_get("", "") except tap_github.ConflictError as e: - self.assertEquals(str(e), "HTTP-error-code: 409, Error: The request could not be completed due to a conflict with the current state of the server.") + self.assertEqual(str(e), "HTTP-error-code: 409, Error: The request could not be completed due to a conflict with the current state of the server.") def test_200_success(self, mocked_parse_args, mocked_request): json = {"key": "value"} mocked_request.return_value = get_response(200, json) resp = tap_github.authed_get("", "") - self.assertEquals(json, resp.json()) + self.assertEqual(json, resp.json()) diff --git a/tests/unittests/test_extract_repos_from_config.py b/tests/unittests/test_extract_repos_from_config.py new file mode 100644 index 00000000..4a205696 --- /dev/null +++ b/tests/unittests/test_extract_repos_from_config.py @@ -0,0 +1,32 @@ +import unittest +import tap_github + + +@unittest.mock.patch('tap_github.get_all_repos') +class TestExtractReposFromConfig(unittest.TestCase): + + def test_single_repo(self, mocked_get_all_repos): + config = {'repository': 'singer-io/test-repo'} + expected_repositories = ['singer-io/test-repo'] + self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) + + def test_multiple_repos(self, mocked_get_all_repos): + config = {'repository': 'singer-io/test-repo singer-io/tap-github'} + expected_repositories = ['singer-io/test-repo', 'singer-io/tap-github'] + self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) + + def test_org_all_repos(self, mocked_get_all_repos): + config = {'repository': 
'singer-io/test-repo test-org/*'} + expected_repositories = [ + 'singer-io/test-repo', + 'test-org/repo1', + 'test-org/repo2', + 'test-org/repo3' + ] + mocked_get_all_repos.return_value = [ + 'test-org/repo1', + 'test-org/repo2', + 'test-org/repo3' + ] + + self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) diff --git a/tests/unittests/test_formatting_dates.py b/tests/unittests/test_formatting_dates.py index 701b2714..72a70925 100644 --- a/tests/unittests/test_formatting_dates.py +++ b/tests/unittests/test_formatting_dates.py @@ -91,7 +91,7 @@ def test_due_on_not_none_2(self, mocked_request): final_state = tap_github.get_all_issue_milestones({}, repo_path, init_state, {}, "") # as we will get 0 records, initial and final bookmark will be same - self.assertEquals(init_bookmark, final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) + self.assertEqual(init_bookmark, final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) @mock.patch("singer.write_record") def test_data_containing_both_values(self, mocked_write_record, mocked_request): @@ -117,4 +117,4 @@ def test_data_containing_both_values(self, mocked_write_record, mocked_request): # as we will get 2 record, final bookmark will be greater than initial bookmark self.assertGreater(last_bookmark, init_bookmark) # as we will get 2 record, write_records will also be called 2 times - self.assertEquals(mocked_write_record.call_count, 2) + self.assertEqual(mocked_write_record.call_count, 2) diff --git a/tests/unittests/test_get_all_repos.py b/tests/unittests/test_get_all_repos.py new file mode 100644 index 00000000..c8ca7a0b --- /dev/null +++ b/tests/unittests/test_get_all_repos.py @@ -0,0 +1,74 @@ +import unittest +import requests +import requests_mock +import simplejson as json + +import tap_github + +from itertools import cycle + + +SESSION = requests.Session() +ADAPTER = requests_mock.Adapter() +SESSION.mount('mock://', ADAPTER) + + 
+@unittest.mock.patch('tap_github.verify_repo_access') +@unittest.mock.patch('tap_github.authed_get_all_pages') +class TestGetAllRepos(unittest.TestCase): + + def test_single_organization(self, mocked_authed_get_all_pages, mocked_verify_repo_access): + orgs = ['test-org/*'] + repos = ['repo1', 'repo2', 'repo3'] + + mocked_url = 'mock://github.com/orgs/test-org/repos' + mocked_response_body = [ + {'full_name': ''.join(r).replace('*', '')} for r in zip(cycle(orgs), repos) + ] + mocked_response_text = json.dumps(mocked_response_body) + ADAPTER.register_uri( + 'GET', + mocked_url, + text=mocked_response_text) + mocked_response = SESSION.get(mocked_url) + + expected_repositories = [ + 'test-org/repo1', + 'test-org/repo2', + 'test-org/repo3' + ] + mocked_authed_get_all_pages.return_value = [mocked_response] + + self.assertEqual(expected_repositories, tap_github.get_all_repos(orgs)) + + def test_multiple_organizations(self, mocked_authed_get_all_pages, mocked_verify_repo_access): + orgs = ['test-org/*', 'singer-io/*'] + repos = ['repo1', 'repo2', 'repo3'] + + mocked_url = 'mock://github.com/orgs/test-org/repos' + side_effect = [] + for org in orgs: + mocked_response_body = [ + {'full_name': ''.join(r).replace('*', '')} for r in zip(cycle([org]), repos) + ] + ADAPTER.register_uri( + 'GET', + mocked_url, + text=json.dumps(mocked_response_body)) + mocked_response = SESSION.get(mocked_url) + mocked_authed_get_all_pages.return_value = [mocked_response] + + call_response = tap_github.get_all_repos([org]) + + side_effect.extend(call_response) + + expected_repositories = [ + 'test-org/repo1', + 'test-org/repo2', + 'test-org/repo3', + 'singer-io/repo1', + 'singer-io/repo2', + 'singer-io/repo3' + ] + + self.assertListEqual(expected_repositories, side_effect) diff --git a/tests/unittests/test_key_error.py b/tests/unittests/test_key_error.py index ab23600d..7e5bb28c 100644 --- a/tests/unittests/test_key_error.py +++ b/tests/unittests/test_key_error.py @@ -23,21 +23,22 @@ def 
test_slug_sub_stream_selected_slug_selected(self, mocked_team_members, mocke mocked_request.return_value = get_response(json) schemas = {"teams": "None", "team_members": "None"} - mdata =[ + mdata_slug = [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']} - }, + }, { 'breadcrumb': ['properties', 'slug'], 'metadata': {'inclusion': 'available'} - }, + }, { "breadcrumb": [ "properties", "name"], "metadata": {"inclusion": "available"} }] + mdata = {"teams": mdata_slug, "team_members": mdata_slug} tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_team_members.call_count, 1) + self.assertEqual(mocked_team_members.call_count, 1) @mock.patch("tap_github.__init__.get_all_team_members") def test_slug_sub_stream_not_selected_slug_selected(self, mocked_team_members, mocked_request): @@ -46,9 +47,9 @@ def test_slug_sub_stream_not_selected_slug_selected(self, mocked_team_members, m mocked_request.return_value = get_response(json) schemas = {"teams": "None"} - mdata =[ + mdata = {"teams": [ { - 'breadcrumb': [], + 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']} }, { @@ -58,9 +59,9 @@ def test_slug_sub_stream_not_selected_slug_selected(self, mocked_team_members, m { "breadcrumb": [ "properties", "name"], "metadata": {"inclusion": "available"} - }] + }]} tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_team_members.call_count, 0) + self.assertEqual(mocked_team_members.call_count, 0) @mock.patch("tap_github.__init__.get_all_team_members") def test_slug_sub_stream_selected_slug_not_selected(self, mocked_team_members, mocked_request): @@ -69,7 +70,7 @@ def test_slug_sub_stream_selected_slug_not_selected(self, mocked_team_members, m mocked_request.return_value = get_response(json) schemas = {"teams": "None", "team_members": "None"} - mdata =[ + mdata_slug = [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']} 
@@ -82,8 +83,9 @@ def test_slug_sub_stream_selected_slug_not_selected(self, mocked_team_members, m "breadcrumb": [ "properties", "name"], "metadata": {"inclusion": "available"} }] + mdata = {"teams": mdata_slug, "team_members": mdata_slug} tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_team_members.call_count, 1) + self.assertEqual(mocked_team_members.call_count, 1) @mock.patch("tap_github.__init__.get_all_team_members") def test_slug_sub_stream_not_selected_slug_not_selected(self, mocked_team_members, mocked_request): @@ -92,7 +94,7 @@ def test_slug_sub_stream_not_selected_slug_not_selected(self, mocked_team_member mocked_request.return_value = get_response(json) schemas = {"teams": "None"} - mdata =[ + mdata = {"teams": [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']} @@ -104,9 +106,9 @@ def test_slug_sub_stream_not_selected_slug_not_selected(self, mocked_team_member { "breadcrumb": [ "properties", "name"], "metadata": {"inclusion": "available"} - }] + }]} tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_team_members.call_count, 0) + self.assertEqual(mocked_team_members.call_count, 0) @mock.patch("tap_github.__init__.authed_get_all_pages") class TestKeyErrorUser(unittest.TestCase): @@ -118,7 +120,7 @@ def test_user_not_selected_in_stargazers(self, mocked_write_records, mocked_requ mocked_request.return_value = get_response(json) schemas = {"teams": "None"} - mdata =[ + mdata = [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['user_id']} @@ -132,7 +134,7 @@ def test_user_not_selected_in_stargazers(self, mocked_write_records, mocked_requ "metadata": {"inclusion": "available"} }] tap_github.get_all_stargazers(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_write_records.call_count, 1) + self.assertEqual(mocked_write_records.call_count, 1) @mock.patch("singer.write_record") def 
test_user_selected_in_stargazers(self, mocked_write_records, mocked_request): @@ -141,7 +143,7 @@ def test_user_selected_in_stargazers(self, mocked_write_records, mocked_request) mocked_request.return_value = get_response(json) schemas = {"stargazers": "None"} - mdata =[ + mdata = [ { 'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['user_id']} @@ -155,4 +157,4 @@ def test_user_selected_in_stargazers(self, mocked_write_records, mocked_request) "metadata": {"inclusion": "available"} }] tap_github.get_all_stargazers(schemas, "tap-github", {}, mdata, "") - self.assertEquals(mocked_write_records.call_count, 1) + self.assertEqual(mocked_write_records.call_count, 1) diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index d335a1e5..7fb01873 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -36,7 +36,7 @@ def test_rate_limit_exception(self, mocked_sleep): try: tap_github.rate_throttling(resp) except tap_github.RateLimitExceeded as e: - self.assertEquals(str(e), "API rate limit exceeded, please try after 601 seconds.") + self.assertEqual(str(e), "API rate limit exceeded, please try after 601 seconds.") def test_rate_limit_not_exceeded(self, mocked_sleep): diff --git a/tests/unittests/test_start_date_bookmark.py b/tests/unittests/test_start_date_bookmark.py index 6489b1d2..8cfb4b18 100644 --- a/tests/unittests/test_start_date_bookmark.py +++ b/tests/unittests/test_start_date_bookmark.py @@ -12,7 +12,7 @@ def test_no_bookmark_no_start_date(self, mocked_get_bookmark): bookmark_key = 'since' expected_bookmark_value = None - self.assertEquals(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) + self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) def test_no_bookmark_yes_start_date(self, mocked_get_bookmark): # Start date is present and bookmark is not present then start date should be 
return. @@ -21,7 +21,7 @@ def test_no_bookmark_yes_start_date(self, mocked_get_bookmark): bookmark_key = 'since' expected_bookmark_value = '2021-04-01T00:00:00.000000Z' - self.assertEquals(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) + self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) def test_yes_bookmark_yes_start_date(self, mocked_get_bookmark): # Start date and bookmark both are present then bookmark should be return. @@ -30,7 +30,7 @@ def test_yes_bookmark_yes_start_date(self, mocked_get_bookmark): bookmark_key = 'since' expected_bookmark_value = '2021-05-01T00:00:00.000000Z' - self.assertEquals(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) + self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) def test_yes_bookmark_no_start_date(self, mocked_get_bookmark): # Start date is not present and bookmark is present then bookmark should be return. @@ -39,4 +39,4 @@ def test_yes_bookmark_no_start_date(self, mocked_get_bookmark): bookmark_key = 'since' expected_bookmark_value = '2021-05-01T00:00:00.000000Z' - self.assertEquals(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) + self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) diff --git a/tests/unittests/test_sub_streams_selection.py b/tests/unittests/test_sub_streams_selection.py index 329f88b2..8dd16ff9 100644 --- a/tests/unittests/test_sub_streams_selection.py +++ b/tests/unittests/test_sub_streams_selection.py @@ -12,7 +12,7 @@ def test_pull_request_sub_streams_not_selected(self): try: tap_github.validate_dependencies(selected_streams) except tap_github.DependencyException as e: - self.assertEquals(str(e), "Unable to extract 'reviews' data, to receive 'reviews' data, you also need to select 'pull_requests'. 
Unable to extract 'pr_commits' data, to receive 'pr_commits' data, you also need to select 'pull_requests'.") + self.assertEqual(str(e), "Unable to extract 'reviews' data, to receive 'reviews' data, you also need to select 'pull_requests'. Unable to extract 'pr_commits' data, to receive 'pr_commits' data, you also need to select 'pull_requests'.") def test_teams_sub_streams_selected(self): selected_streams = ["teams", "team_members"] @@ -23,7 +23,7 @@ def test_teams_sub_streams_not_selected(self): try: tap_github.validate_dependencies(selected_streams) except tap_github.DependencyException as e: - self.assertEquals(str(e), "Unable to extract 'team_members' data, to receive 'team_members' data, you also need to select 'teams'.") + self.assertEqual(str(e), "Unable to extract 'team_members' data, to receive 'team_members' data, you also need to select 'teams'.") def test_projects_sub_streams_selected(self): selected_streams = ["projects", "project_cards"] @@ -34,7 +34,7 @@ def test_projects_sub_streams_not_selected(self): try: tap_github.validate_dependencies(selected_streams) except tap_github.DependencyException as e: - self.assertEquals(str(e), "Unable to extract 'project_columns' data, to receive 'project_columns' data, you also need to select 'projects'.") + self.assertEqual(str(e), "Unable to extract 'project_columns' data, to receive 'project_columns' data, you also need to select 'projects'.") def test_mixed_streams_positive(self): selected_streams = ["pull_requests", "reviews", "collaborators", "team_members", "stargazers", "projects", "teams", "project_cards"] @@ -45,4 +45,4 @@ def test_mixed_streams_negative(self): try: tap_github.validate_dependencies(selected_streams) except tap_github.DependencyException as e: - self.assertEquals(str(e), "Unable to extract 'review_comments' data, to receive 'review_comments' data, you also need to select 'pull_requests'.") + self.assertEqual(str(e), "Unable to extract 'review_comments' data, to receive 'review_comments' 
data, you also need to select 'pull_requests'.") diff --git a/tests/unittests/test_verify_access.py b/tests/unittests/test_verify_access.py index 72f868de..1e00df32 100644 --- a/tests/unittests/test_verify_access.py +++ b/tests/unittests/test_verify_access.py @@ -34,7 +34,7 @@ def test_repo_not_found(self, mocked_parse_args, mocked_request): try: tap_github.verify_repo_access("", "repo") except tap_github.NotFoundException as e: - self.assertEquals(str(e), "HTTP-error-code: 404, Error: Please check the repository name 'repo' or you do not have sufficient permissions to access this repository.") + self.assertEqual(str(e), "HTTP-error-code: 404, Error: Please check the repository name 'repo' or you do not have sufficient permissions to access this repository.") def test_repo_bad_request(self, mocked_parse_args, mocked_request): mocked_request.return_value = get_response(400, raise_error = True) @@ -42,7 +42,7 @@ def test_repo_bad_request(self, mocked_parse_args, mocked_request): try: tap_github.verify_repo_access("", "repo") except tap_github.BadRequestException as e: - self.assertEquals(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") + self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") def test_repo_bad_creds(self, mocked_parse_args, mocked_request): json = {"message": "Bad credentials", "documentation_url": "https://docs.github.com/"} @@ -51,7 +51,7 @@ def test_repo_bad_creds(self, mocked_parse_args, mocked_request): try: tap_github.verify_repo_access("", "repo") except tap_github.BadCredentialsException as e: - self.assertEquals(str(e), "HTTP-error-code: 401, Error: {}".format(json)) + self.assertEqual(str(e), "HTTP-error-code: 401, Error: {}".format(json)) @mock.patch("tap_github.get_catalog") def test_discover_valid_creds(self, mocked_get_catalog, mocked_parse_args, mocked_request): @@ -71,8 +71,8 @@ def test_discover_not_found(self, mocked_get_catalog, mocked_parse_args, 
mocked_ try: tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) except tap_github.NotFoundException as e: - self.assertEquals(str(e), "HTTP-error-code: 404, Error: Please check the repository name 'org/repo' or you do not have sufficient permissions to access this repository.") - self.assertEqual(mocked_get_catalog.call_count, 0) + self.assertEqual(str(e), "HTTP-error-code: 404, Error: Please check the repository name org/repo or you do not have sufficient permissions to access this repository.") + self.assertEqual(mocked_get_catalog.call_count, 1) @mock.patch("tap_github.get_catalog") def test_discover_bad_request(self, mocked_get_catalog, mocked_parse_args, mocked_request): @@ -82,7 +82,7 @@ def test_discover_bad_request(self, mocked_get_catalog, mocked_parse_args, mocke try: tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) except tap_github.BadRequestException as e: - self.assertEquals(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") + self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") self.assertEqual(mocked_get_catalog.call_count, 0) @mock.patch("tap_github.get_catalog") @@ -94,7 +94,7 @@ def test_discover_bad_creds(self, mocked_get_catalog, mocked_parse_args, mocked_ try: tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) except tap_github.BadCredentialsException as e: - self.assertEquals(str(e), "HTTP-error-code: 401, Error: {}".format(json)) + self.assertEqual(str(e), "HTTP-error-code: 401, Error: {}".format(json)) self.assertEqual(mocked_get_catalog.call_count, 0) @mock.patch("tap_github.get_catalog") @@ -106,7 +106,7 @@ def test_discover_forbidden(self, mocked_get_catalog, mocked_parse_args, mocked_ try: tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) except tap_github.AuthException as e: - self.assertEquals(str(e), "HTTP-error-code: 
403, Error: {}".format(json)) + self.assertEqual(str(e), "HTTP-error-code: 403, Error: {}".format(json)) self.assertEqual(mocked_get_catalog.call_count, 0) @@ -123,5 +123,5 @@ def test_repo_call_count(self, mocked_repo, mocked_logger_info): config = {"access_token": "access_token", "repository": "org1/repo1 org1/repo2 org2/repo1"} tap_github.verify_access_for_repo(config) - self.assertEquals(mocked_logger_info.call_count, 3) - self.assertEquals(mocked_repo.call_count, 3) + self.assertEqual(mocked_logger_info.call_count, 3) + self.assertEqual(mocked_repo.call_count, 3) From f1ba2fd040817bebe7afe5b3f109ff9035f15b3c Mon Sep 17 00:00:00 2001 From: KrisPersonal <66801357+KrisPersonal@users.noreply.github.com> Date: Mon, 14 Feb 2022 20:31:36 +0530 Subject: [PATCH 02/30] Bump version for implementing wild card in Github Repository Name (#156) Co-authored-by: KrishnanG --- CHANGELOG.md | 4 ++++ setup.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 81608664..85e02259 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +# 1.10.3 + * Implemented wildcard implementation [#145] (https://github.com/singer-io/tap-github/pull/145) + * Added additional test coverage [#145] (https://github.com/singer-io/tap-github/pull/145) + # 1.10.2 * Added Request Timeout diff --git a/setup.py b/setup.py index 4f8a4836..c0dd7f4b 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='1.10.2', + version='1.10.3', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 3126c9dc125333451fff6004397c5bcb716c7b64 Mon Sep 17 00:00:00 2001 From: Andrey Kabanov Date: Wed, 23 Feb 2022 14:09:38 -0800 Subject: [PATCH 03/30] Fix team_members stream PK (#157) * add `team_slug` as a pk for team members stream * add `team_slug` to schema * add `team_slug` to team members PK in tap-tester * remove team 
members stream bug info --- tap_github/__init__.py | 3 ++- tap_github/schemas/team_members.json | 6 ++++++ tests/base.py | 2 +- tests/test_github_automatic_fields.py | 12 +++++------- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/tap_github/__init__.py b/tap_github/__init__.py index 1f4eebcc..5c2d768a 100644 --- a/tap_github/__init__.py +++ b/tap_github/__init__.py @@ -39,7 +39,7 @@ 'project_cards': ['id'], 'repos': ['id'], 'teams': ['id'], - 'team_members': ['id'], + 'team_members': ['id', 'team_slug'], 'team_memberships': ['url'] } @@ -442,6 +442,7 @@ def get_all_team_members(team_slug, schemas, repo_path, state, mdata): team_members = response.json() for r in team_members: r['_sdc_repository'] = repo_path + r['team_slug'] = team_slug # transform and write release record with singer.Transformer() as transformer: diff --git a/tap_github/schemas/team_members.json b/tap_github/schemas/team_members.json index 82b17bed..b707c5e3 100644 --- a/tap_github/schemas/team_members.json +++ b/tap_github/schemas/team_members.json @@ -117,6 +117,12 @@ "null", "string" ] + }, + "team_slug": { + "type": [ + "null", + "string" + ] } } } \ No newline at end of file diff --git a/tests/base.py b/tests/base.py index 6ea415a4..33c0478a 100644 --- a/tests/base.py +++ b/tests/base.py @@ -176,7 +176,7 @@ def expected_metadata(self): self.OBEYS_START_DATE: False }, "team_members": { - self.PRIMARY_KEYS: {"id"}, + self.PRIMARY_KEYS: {"id", "team_slug"}, self.REPLICATION_METHOD: self.FULL, self.OBEYS_START_DATE: False }, diff --git a/tests/test_github_automatic_fields.py b/tests/test_github_automatic_fields.py index 03ae904f..7a5bc759 100644 --- a/tests/test_github_automatic_fields.py +++ b/tests/test_github_automatic_fields.py @@ -64,10 +64,8 @@ def test_run(self): for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys) - # BUG-TDL-17507 An org can have multiple teams with overlapping membership - if stream != 'team_members': - # Verify that 
all replicated records have unique primary key values. - self.assertEqual( - len(primary_keys_list), - len(unique_primary_keys_list), - msg="Replicated record does not have unique primary key values.") + # Verify that all replicated records have unique primary key values. + self.assertEqual( + len(primary_keys_list), + len(unique_primary_keys_list), + msg="Replicated record does not have unique primary key values.") From 4f7ba58e501bd5026979194a0c25f31d32d3ffe0 Mon Sep 17 00:00:00 2001 From: KrisPersonal <66801357+KrisPersonal@users.noreply.github.com> Date: Wed, 23 Feb 2022 15:08:00 -0800 Subject: [PATCH 04/30] Bump version to fix team_members primary key (#158) Co-authored-by: KrishnanG --- CHANGELOG.md | 3 +++ setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85e02259..f2e645ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 1.10.4 + * Fix team_members stream primary Key [#157] (https://github.com/singer-io/tap-github/pull/157) + # 1.10.3 * Implemented wildcard implementation [#145] (https://github.com/singer-io/tap-github/pull/145) * Added additional test coverage [#145] (https://github.com/singer-io/tap-github/pull/145) diff --git a/setup.py b/setup.py index c0dd7f4b..4b191385 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='1.10.3', + version='1.10.4', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 8973411ee2f2cf007fe74adcae203f116750180f Mon Sep 17 00:00:00 2001 From: Prijen Khokhani <88327452+prijendev@users.noreply.github.com> Date: Tue, 20 Sep 2022 18:55:39 +0530 Subject: [PATCH 05/30] Crest master (#176) * TDL-19555 added missing test cases and updated test cases according to code change (#167) * added missing test cases and updated test cases according to code change * removed tags stream as it is not yet added * 
Updated inturrupted sync assertion * Removed bookmark conversion to utc * updated inerrupted sync assertions * resolved review comments * updated sreams in parent-child test case * resolved review comments * resolved review comments * Updated start_date test * updated date format conversion in test * unskipped incremental child streams from assertion * updated base.py * resolved review comments * resolved review comment * update variable name * resolved review comment and updated print - LOGGER * TDL 19530 update dict based implementation to class based (#168) * added backoff for 5xx error and fixed indexerror when only passed org in config * added missing test cases and updated test cases according to code change * removed tags stream as it is not yet added * Updated inturrupted sync assertion * Removed bookmark conversion to utc * updated inerrupted sync assertions * resolved review comments * updated sreams in parent-child test case * Initial commit. * Removed unnecessary unit test cases. * updated code comments * Resolved pylint errors. * disabled pylint for get_child_records. * Added unittest cases. * Added filter params for commits and comments stream. * Updated unit test cases. * resolved review comments * resolved review comments * Updated start_date test * updated date format conversion in test * Updated bookmark logic for parent child streams. * Updated credentials check logic. * Updated unittest case. * Resolved PR comments and handled error for / in repository in config * added unittests for only / in config * Updated client.py to raise error for wrong org. * Added back creds check logic during object creation time. * handled the /repo and raised exception with a list * unskipped incremental child streams from assertion * Make pk of child automatic in parent stream. * updated unittests * Added logger message to print url for all streams. * updated base.py * Skipped 404 error for all streams. 
* Removed unnecessary __exit__ method from client * Updated doc link for each stream. * removed duplicates from extract_repos_from_config() and thrown warning message for duplicate repos * resolved PR comments and fixed pylint * Added __exit__ method in the Client object. * Added new line in the end of client.py * Updated schema for pr_commits. * resolved PR comments * Give warning message for invalid repo. * Added comments in the code. * resolved PR comments * resolved review comments * Raised error for invalid organization. * Updated logger message in client.py * Updated unittest case to use parameterized. * added a new TooManyRequests class error for 429 * resolved review comment * update variable name * resolved review comment and updated print - LOGGER * TDL 16573 sync teams at organization level (#173) * Initial commit. * Updated unittest case and resolved pylints. * Defined orgs streams in constant. * Updated extract_repo_from_config method. * Updated unit test cases. * Added back unittest cases. * TDL-19555 added missing test cases and updated test cases according to code change (#167) * added missing test cases and updated test cases according to code change * removed tags stream as it is not yet added * Updated inturrupted sync assertion * Removed bookmark conversion to utc * updated inerrupted sync assertions * resolved review comments * updated sreams in parent-child test case * resolved review comments * resolved review comments * Updated start_date test * updated date format conversion in test * unskipped incremental child streams from assertion * updated base.py * resolved review comments * resolved review comment * update variable name * resolved review comment and updated print - LOGGER * Reverted back changes of all field test case. Co-authored-by: NevilParikh14 <92399024+NevilParikh14@users.noreply.github.com> * TDL-5961 Support of custom domain (#172) * Initial commit. * Removed url from the stream class. * Updated readme and sample config. 
* Updated sync test to utilize custom domain config parameter `base_url`. * Updated test_stream unit test. * Removed duplicate comment. * Added support of empty string in custom domain. Co-authored-by: namrata270998 * TDL-12323 Implemented currently syncing for repos and streams (#171) * TDL-12323 Implemented currently syncing for repos and streams * resolved PR review comments * TDL-19980 fixed the bug * fixed the bug when any stream not selected, but the state writes currenctly_syncing * resolved PR comments * resolved pylint * fixed cci issue * fixed cci issues * fixing cci issue * resolved cci issues * fixed cci issue * made cci happy * updated one assertion and comments * Updated verify_access_for_repo method in client. * Resolved pylint. * Updated currently syncing for teams streams. (#174) Co-authored-by: namrata270998 Co-authored-by: NevilParikh14 Co-authored-by: NevilParikh14 <92399024+NevilParikh14@users.noreply.github.com> Co-authored-by: namrata270998 <75604662+namrata270998@users.noreply.github.com> * Tdl 20302 add query param for pull requests (#175) * Added query param for pull_requests streams. * Updated unit test for pull_requests. * Updated interrupted sync test case. * Updated tap-tester for interupted sync. * TDL-19526 added missing fields (#170) * added backoff for 5xx error and fixed indexerror when only passed org in config * added missing test cases and updated test cases according to code change * removed tags stream as it is not yet added * Updated inturrupted sync assertion * Removed bookmark conversion to utc * updated inerrupted sync assertions * resolved review comments * updated sreams in parent-child test case * Initial commit. * Removed unnecessary unit test cases. * updated code comments * Resolved pylint errors. * disabled pylint for get_child_records. * Added unittest cases. * Added filter params for commits and comments stream. * Updated unit test cases. 
* resolved review comments * resolved review comments * Updated start_date test * updated date format conversion in test * added missing fields * Updated bookmark logic for parent child streams. * Updated credentials check logic. * Updated unittest case. * Resolved PR comments and handled error for / in repository in config * added unittests for only / in config * Updated client.py to raise error for wrong org. * Added back creds check logic during object creation time. * removed extra formatting * improved formatting * handled the /repo and raised exception with a list * unskipped incremental child streams from assertion * Make pk of child automatic in parent stream. * updated unittests * Added logger message to print url for all streams. * updated base.py * Skipped 404 error for all streams. * Removed unnecessary __exit__ method from client * Updated doc link for each stream. * removed duplicates from extract_repos_from_config() and thrown warning message for duplicate repos * resolved PR comments and fixed pylint * resolved pr comments * Added __exit__ method in the Client object. * added more fields to shared schema * removed additionalproperties as duplicate * Added new line in the end of client.py * added user to shared schema * updated indentation * Resolved PR review comments * resolved pr comments * added more references * added ref for assignees * removed description * added ref schema to reactions * fixed type of total count in reactions * removed indentation from team_members and commits * Updated schema for pr_commits. * removed _sdc_repo from users.json * removed commits from shared schema and added the original schema back * resolved PR comments * Give warning message for invalid repo. * Added comments in the code. * updated indentation * updated ref * resolved PR comments * resolved review comments * Raised error for invalid organization. * Updated logger message in client.py * Updated unittest case to use parameterized. 
* added missing fields * removed shared schema for some fields * added missing fields * added missing fields * reverted the type of some fields * removed extra ',' from schema * added missing fields in the KNOWN_MISSING_FIELDS for all_fields * added a new TooManyRequests class error for 429 * added missing fields in the schema * added draft in missing fields * updated indentation * updated indentation level * removed extra fields from commit->committer * added author and stats in pr_commits * added misisng field in all_fields * updated the string schema type to obj as per api doc * resolved review comment * update variable name * added string in changed object type for backup * resolved review comment and updated print - LOGGER * used shared schema for performed_via_github_app and added ref back inissue_events stream * fixed typo * TDL-14359 added properties to issue assignees (#169) * added properties to issue assignees * updated issues schema * resolved review comments * added assignees in shared schema * updated assignee to user as same schema * deleted shared assignees * reverted back to number for some schemas * removed extra',' from schema * copied the base branch fields back to assignees schema * removed additionalProperties from user.json * updated indentation level * updated indentation level * TDL 16573 sync teams at organization level (#173) * Initial commit. * Updated unittest case and resolved pylints. * Defined orgs streams in constant. * Updated extract_repo_from_config method. * Updated unit test cases. * Added back unittest cases. 
* TDL-19555 added missing test cases and updated test cases according to code change (#167) * added missing test cases and updated test cases according to code change * removed tags stream as it is not yet added * Updated inturrupted sync assertion * Removed bookmark conversion to utc * updated inerrupted sync assertions * resolved review comments * updated sreams in parent-child test case * resolved review comments * resolved review comments * Updated start_date test * updated date format conversion in test * unskipped incremental child streams from assertion * updated base.py * resolved review comments * resolved review comment * update variable name * resolved review comment and updated print - LOGGER * Reverted back changes of all field test case. Co-authored-by: NevilParikh14 <92399024+NevilParikh14@users.noreply.github.com> * TDL-5961 Support of custom domain (#172) * Initial commit. * Removed url from the stream class. * Updated readme and sample config. * Updated sync test to utilize custom domain config parameter `base_url`. * Updated test_stream unit test. * Removed duplicate comment. * Added support of empty string in custom domain. Co-authored-by: namrata270998 * TDL-12323 Implemented currently syncing for repos and streams (#171) * TDL-12323 Implemented currently syncing for repos and streams * resolved PR review comments * TDL-19980 fixed the bug * fixed the bug when any stream not selected, but the state writes currenctly_syncing * resolved PR comments * resolved pylint * fixed cci issue * fixed cci issues * fixing cci issue * resolved cci issues * fixed cci issue * made cci happy * updated one assertion and comments * Updated verify_access_for_repo method in client. * Resolved pylint. * Updated currently syncing for teams streams. 
(#174) * made schema type in a single line * fixed cci issue * added date-time format for closed_at in issue_events.json Co-authored-by: NevilParikh14 Co-authored-by: prijendev Co-authored-by: Prijen Khokhani <88327452+prijendev@users.noreply.github.com> Co-authored-by: NevilParikh14 <92399024+NevilParikh14@users.noreply.github.com> * TDL-20336 custom domain which are not created as a part of Github results into key error (#177) * Raise exception if `X-RateLimit-Reset` is not found in the header. * Updated comment in the unit test. * TDL 20339 handle empty bookmark if all repos unselected for next sync. (#178) * Handle empty bookmark if all existing repos are unselected. * Added detailed comments and unit test. * Added more comments. * Updated variable name and comments. * Updated comment. * Updated logic to return existing state. * Updated variable name and function name in unit test. * Removed duplicate field in the team_membership schema. Co-authored-by: NevilParikh14 <92399024+NevilParikh14@users.noreply.github.com> Co-authored-by: namrata270998 Co-authored-by: NevilParikh14 Co-authored-by: namrata270998 <75604662+namrata270998@users.noreply.github.com> --- .circleci/config.yml | 2 +- MANIFEST.in | 1 + README.md | 15 +- config.sample.json | 3 +- tap_github/__init__.py | 1195 +--------- tap_github/client.py | 344 +++ tap_github/discover.py | 36 + tap_github/schema.py | 68 + tap_github/schemas/assignees.json | 51 + tap_github/schemas/collaborators.json | 54 + tap_github/schemas/comments.json | 134 +- tap_github/schemas/commit_comments.json | 172 +- tap_github/schemas/commits.json | 256 ++- tap_github/schemas/events.json | 1135 +++++++-- tap_github/schemas/issue_events.json | 2024 ++++++----------- tap_github/schemas/issue_labels.json | 47 +- tap_github/schemas/issue_milestones.json | 269 +-- tap_github/schemas/issues.json | 402 ++-- tap_github/schemas/pr_commits.json | 323 +++ tap_github/schemas/project_cards.json | 150 +- tap_github/schemas/project_columns.json 
| 52 +- tap_github/schemas/projects.json | 203 +- tap_github/schemas/pull_requests.json | 1564 ++++++++++++- tap_github/schemas/releases.json | 157 +- tap_github/schemas/review_comments.json | 70 +- tap_github/schemas/reviews.json | 39 +- .../schemas/shared/issue_permissions.json | 20 + .../shared/performed_via_github_app.json | 61 + .../schemas/shared/pull_permissions.json | 20 + tap_github/schemas/shared/reactions.json | 35 + tap_github/schemas/shared/user.json | 68 + tap_github/schemas/stargazers.json | 8 +- tap_github/schemas/team_members.json | 116 +- tap_github/schemas/team_memberships.json | 28 +- tap_github/schemas/teams.json | 74 +- tap_github/streams.py | 768 +++++++ tap_github/sync.py | 236 ++ tests/base.py | 88 +- tests/test_github_all_fields.py | 160 +- tests/test_github_automatic_fields.py | 31 +- tests/test_github_bookmarks.py | 117 +- tests/test_github_discovery.py | 39 +- tests/test_github_interrupted_sync.py | 172 ++ ...test_github_interrupted_sync_add_stream.py | 177 ++ ...t_github_interrupted_sync_remove_stream.py | 202 ++ tests/test_github_pagination.py | 78 +- .../test_github_parent_child_independednt.py | 48 + tests/test_github_start_date.py | 105 +- tests/test_github_sync.py | 8 + tests/unittests/test_currently_syncing.py | 114 + tests/unittests/test_custom_domain.py | 29 + tests/unittests/test_exception_handling.py | 169 +- .../test_extract_repos_from_config.py | 88 +- tests/unittests/test_formatting_dates.py | 120 - tests/unittests/test_get_all_repos.py | 63 +- .../test_get_streams_and_state_translate.py | 135 ++ tests/unittests/test_key_error.py | 160 -- tests/unittests/test_main.py | 103 + tests/unittests/test_rate_limit.py | 44 +- tests/unittests/test_stargazers_full_table.py | 14 - tests/unittests/test_start_date_bookmark.py | 42 - tests/unittests/test_stream.py | 189 ++ tests/unittests/test_sub_streams_selection.py | 48 - tests/unittests/test_sync.py | 168 ++ tests/unittests/test_sync_endpoint.py | 289 +++ 
tests/unittests/test_timeout.py | 179 +- tests/unittests/test_verify_access.py | 115 +- 67 files changed, 8793 insertions(+), 4701 deletions(-) create mode 100644 tap_github/client.py create mode 100644 tap_github/discover.py create mode 100644 tap_github/schema.py create mode 100644 tap_github/schemas/pr_commits.json create mode 100644 tap_github/schemas/shared/issue_permissions.json create mode 100644 tap_github/schemas/shared/performed_via_github_app.json create mode 100644 tap_github/schemas/shared/pull_permissions.json create mode 100644 tap_github/schemas/shared/reactions.json create mode 100644 tap_github/schemas/shared/user.json create mode 100644 tap_github/streams.py create mode 100644 tap_github/sync.py create mode 100644 tests/test_github_interrupted_sync.py create mode 100644 tests/test_github_interrupted_sync_add_stream.py create mode 100644 tests/test_github_interrupted_sync_remove_stream.py create mode 100644 tests/test_github_parent_child_independednt.py create mode 100644 tests/unittests/test_currently_syncing.py create mode 100644 tests/unittests/test_custom_domain.py delete mode 100644 tests/unittests/test_formatting_dates.py create mode 100644 tests/unittests/test_get_streams_and_state_translate.py delete mode 100644 tests/unittests/test_key_error.py create mode 100644 tests/unittests/test_main.py delete mode 100644 tests/unittests/test_stargazers_full_table.py delete mode 100644 tests/unittests/test_start_date_bookmark.py create mode 100644 tests/unittests/test_stream.py delete mode 100644 tests/unittests/test_sub_streams_selection.py create mode 100644 tests/unittests/test_sync.py create mode 100644 tests/unittests/test_sync_endpoint.py diff --git a/.circleci/config.yml b/.circleci/config.yml index a6969bf5..03f71d0a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -29,7 +29,7 @@ jobs: name: 'Unit Tests' command: | source /usr/local/share/virtualenvs/tap-github/bin/activate - pip install nose coverage + pip install nose 
coverage parameterized nosetests --with-coverage --cover-erase --cover-package=tap_github --cover-html-dir=htmlcov tests/unittests coverage html when: always diff --git a/MANIFEST.in b/MANIFEST.in index 374734bd..68d81181 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include LICENSE include tap_github/schemas/*.json +include tap_github/schemas/shared/*.json diff --git a/README.md b/README.md index 3e956789..e8c4df01 100644 --- a/README.md +++ b/README.md @@ -53,15 +53,18 @@ This tap: 3. Create the config file Create a JSON file containing the start date, access token you just created - and the path to one or multiple repositories that you want to extract data from. Each repo path should be space delimited. The repo path is relative to - `https://github.com/`. For example the path for this repository is + and the path to one or multiple repositories that you want to extract data from. Each repo path should be space delimited. The repo path is relative to `"base_url"` + (Default: `https://github.com/`). For example the path for this repository is `singer-io/tap-github`. You can also add request timeout to set the timeout for requests which is an optional parameter with default value of 300 seconds. ```json - {"access_token": "your-access-token", - "repository": "singer-io/tap-github singer-io/getting-started", - "start_date": "2021-01-01T00:00:00Z", - "request_timeout": 300} + { + "access_token": "your-access-token", + "repository": "singer-io/tap-github singer-io/getting-started", + "start_date": "2021-01-01T00:00:00Z", + "request_timeout": 300, + "base_url": "https://api.github.com" + } ``` 4. 
Run the tap in discovery mode to get properties.json file diff --git a/config.sample.json b/config.sample.json index aff4e2f1..61df3707 100644 --- a/config.sample.json +++ b/config.sample.json @@ -2,5 +2,6 @@ "access_token": "abcdefghijklmnopqrstuvwxyz1234567890ABCD", "repository": "singer-io/target-stitch", "start_date": "2021-01-01T00:00:00Z", - "request_timeout": 300 + "request_timeout": 300, + "base_url": "https://api.github.com" } diff --git a/tap_github/__init__.py b/tap_github/__init__.py index 5c2d768a..cc93a061 100644 --- a/tap_github/__init__.py +++ b/tap_github/__init__.py @@ -1,1193 +1,42 @@ -import os import json -import collections -import time -import requests -import backoff +import sys import singer +from tap_github.discover import discover as _discover +from tap_github.client import GithubClient +from tap_github.sync import sync as _sync -from singer import (bookmarks, metrics, metadata) -from simplejson import JSONDecodeError - -session = requests.Session() -logger = singer.get_logger() - -# set default timeout of 300 seconds -REQUEST_TIMEOUT = 300 +LOGGER = singer.get_logger() REQUIRED_CONFIG_KEYS = ['start_date', 'access_token', 'repository'] -KEY_PROPERTIES = { - 'commits': ['sha'], - 'comments': ['id'], - 'issues': ['id'], - 'assignees': ['id'], - 'collaborators': ['id'], - 'pull_requests':['id'], - 'stargazers': ['user_id'], - 'releases': ['id'], - 'reviews': ['id'], - 'review_comments': ['id'], - 'pr_commits': ['id'], - 'events': ['id'], - 'issue_events': ['id'], - 'issue_labels': ['id'], - 'issue_milestones': ['id'], - 'commit_comments': ['id'], - 'projects': ['id'], - 'project_columns': ['id'], - 'project_cards': ['id'], - 'repos': ['id'], - 'teams': ['id'], - 'team_members': ['id', 'team_slug'], - 'team_memberships': ['url'] -} - -DEFAULT_SLEEP_SECONDS = 600 -MAX_SLEEP_SECONDS = DEFAULT_SLEEP_SECONDS - -class GithubException(Exception): - pass - -class BadCredentialsException(GithubException): - pass - -class 
AuthException(GithubException): - pass - -class NotFoundException(GithubException): - pass - -class BadRequestException(GithubException): - pass - -class InternalServerError(GithubException): - pass - -class UnprocessableError(GithubException): - pass - -class NotModifiedError(GithubException): - pass - -class MovedPermanentlyError(GithubException): - pass - -class ConflictError(GithubException): - pass - -class RateLimitExceeded(GithubException): - pass - -ERROR_CODE_EXCEPTION_MAPPING = { - 301: { - "raise_exception": MovedPermanentlyError, - "message": "The resource you are looking for is moved to another URL." - }, - 304: { - "raise_exception": NotModifiedError, - "message": "The requested resource has not been modified since the last time you accessed it." - }, - 400:{ - "raise_exception": BadRequestException, - "message": "The request is missing or has a bad parameter." - }, - 401: { - "raise_exception": BadCredentialsException, - "message": "Invalid authorization credentials." - }, - 403: { - "raise_exception": AuthException, - "message": "User doesn't have permission to access the resource." - }, - 404: { - "raise_exception": NotFoundException, - "message": "The resource you have specified cannot be found. Alternatively the access_token is not valid for the resource" - }, - 409: { - "raise_exception": ConflictError, - "message": "The request could not be completed due to a conflict with the current state of the server." - }, - 422: { - "raise_exception": UnprocessableError, - "message": "The request was not able to process right now." - }, - 500: { - "raise_exception": InternalServerError, - "message": "An error has occurred at Github's end." 
- } -} - -def translate_state(state, catalog, repositories): - ''' - This tap used to only support a single repository, in which case the - state took the shape of: - { - "bookmarks": { - "commits": { - "since": "2018-11-14T13:21:20.700360Z" - } - } - } - The tap now supports multiple repos, so this function should be called - at the beginning of each run to ensure the state is translate to the - new format: - { - "bookmarks": { - "singer-io/tap-adwords": { - "commits": { - "since": "2018-11-14T13:21:20.700360Z" - } - } - "singer-io/tap-salesforce": { - "commits": { - "since": "2018-11-14T13:21:20.700360Z" - } - } - } - } - ''' - nested_dict = lambda: collections.defaultdict(nested_dict) - new_state = nested_dict() - - for stream in catalog['streams']: - stream_name = stream['tap_stream_id'] - for repo in repositories: - if bookmarks.get_bookmark(state, repo, stream_name): - return state - if bookmarks.get_bookmark(state, stream_name, 'since'): - new_state['bookmarks'][repo][stream_name]['since'] = bookmarks.get_bookmark(state, stream_name, 'since') - - return new_state - - -def get_bookmark(state, repo, stream_name, bookmark_key, start_date): - repo_stream_dict = bookmarks.get_bookmark(state, repo, stream_name) - if repo_stream_dict: - return repo_stream_dict.get(bookmark_key) - if start_date: - return start_date - return None - -def raise_for_error(resp, source): - error_code = resp.status_code - try: - response_json = resp.json() - except JSONDecodeError: - response_json = {} - - if error_code == 404: - details = ERROR_CODE_EXCEPTION_MAPPING.get(error_code).get("message") - if source == "teams": - details += ' or it is a personal account repository' - message = "HTTP-error-code: 404, Error: {}. 
Please refer \'{}\' for more details.".format(details, response_json.get("documentation_url")) - logger.info(message) - # don't raise a NotFoundException - return None - - message = "HTTP-error-code: {}, Error: {}".format( - error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) - - exc = ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("raise_exception", GithubException) - raise exc(message) from None - -def calculate_seconds(epoch): - current = time.time() - return int(round((epoch - current), 0)) - -def rate_throttling(response): - if int(response.headers['X-RateLimit-Remaining']) == 0: - seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) - - if seconds_to_sleep > MAX_SLEEP_SECONDS: - message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) - raise RateLimitExceeded(message) from None - - logger.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) - time.sleep(seconds_to_sleep) - -# pylint: disable=dangerous-default-value -# during 'Timeout' error there is also possibility of 'ConnectionError', -# hence added backoff for 'ConnectionError' too. 
-@backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError), max_tries=5, factor=2) -def authed_get(source, url, headers={}): - with metrics.http_request_timer(source) as timer: - session.headers.update(headers) - resp = session.request(method='get', url=url, timeout=get_request_timeout()) - if resp.status_code != 200: - raise_for_error(resp, source) - timer.tags[metrics.Tag.http_status_code] = resp.status_code - rate_throttling(resp) - if resp.status_code == 404: - # return an empty response body since we're not raising a NotFoundException - resp._content = b'{}' # pylint: disable=protected-access - return resp - -def authed_get_all_pages(source, url, headers={}): - while True: - r = authed_get(source, url, headers) - yield r - if 'next' in r.links: - url = r.links['next']['url'] - else: - break - -def get_abs_path(path): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) - - -def generate_pr_commit_schema(commit_schema): - pr_commit_schema = commit_schema.copy() - pr_commit_schema['properties']['pr_number'] = { - "type": ["null", "integer"] - } - pr_commit_schema['properties']['pr_id'] = { - "type": ["null", "string"] - } - pr_commit_schema['properties']['id'] = { - "type": ["null", "string"] - } - - return pr_commit_schema - -def load_schemas(): - schemas = {} - - for filename in os.listdir(get_abs_path('schemas')): - path = get_abs_path('schemas') + '/' + filename - file_raw = filename.replace('.json', '') - with open(path, encoding='utf-8') as file: - schemas[file_raw] = json.load(file) - - schemas['pr_commits'] = generate_pr_commit_schema(schemas['commits']) - return schemas - -class DependencyException(Exception): - pass - -def validate_dependencies(selected_stream_ids): - errs = [] - msg_tmpl = ("Unable to extract '{0}' data, " - "to receive '{0}' data, you also need to select '{1}'.") - - for main_stream, sub_streams in SUB_STREAMS.items(): - if main_stream not in selected_stream_ids: - for sub_stream in 
sub_streams: - if sub_stream in selected_stream_ids: - errs.append(msg_tmpl.format(sub_stream, main_stream)) - - if errs: - raise DependencyException(" ".join(errs)) - - -def write_metadata(mdata, values, breadcrumb): - mdata.append( - { - 'metadata': values, - 'breadcrumb': breadcrumb - } - ) - -def populate_metadata(schema_name, schema): - mdata = metadata.new() - #mdata = metadata.write(mdata, (), 'forced-replication-method', KEY_PROPERTIES[schema_name]) - mdata = metadata.write(mdata, (), 'table-key-properties', KEY_PROPERTIES[schema_name]) - - for field_name in schema['properties'].keys(): - if field_name in KEY_PROPERTIES[schema_name]: - mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'automatic') - else: - mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'available') - - return mdata - -def get_catalog(): - raw_schemas = load_schemas() - streams = [] - - for schema_name, schema in raw_schemas.items(): - - # get metadata for each field - mdata = populate_metadata(schema_name, schema) - - # create and add catalog entry - catalog_entry = { - 'stream': schema_name, - 'tap_stream_id': schema_name, - 'schema': schema, - 'metadata' : metadata.to_list(mdata), - 'key_properties': KEY_PROPERTIES[schema_name], - } - streams.append(catalog_entry) - - return {'streams': streams} - -def get_all_repos(organizations: list) -> list: +def do_discover(client): """ - Retrieves all repositories for the provided organizations and - verifies basic access for them. - - Docs: https://docs.github.com/en/rest/reference/repos#list-organization-repositories + Call the discovery function. 
""" - repos = [] - - for org_path in organizations: - org = org_path.split('/')[0] - for response in authed_get_all_pages( - 'get_all_repos', - 'https://api.github.com/orgs/{}/repos?sort=created&direction=desc'.format(org) - ): - org_repos = response.json() - - for repo in org_repos: - repo_full_name = repo.get('full_name') - - logger.info("Verifying access of repository: %s", repo_full_name) - verify_repo_access( - 'https://api.github.com/repos/{}/commits'.format(repo_full_name), - repo - ) + catalog = _discover(client) + # Dump catalog + json.dump(catalog, sys.stdout, indent=2) - repos.append(repo_full_name) - - return repos - -def extract_repos_from_config(config: dict ) -> list: +@singer.utils.handle_top_exception(LOGGER) +def main(): """ - Extracts all repositories from the config and calls get_all_repos() - for organizations using the wildcard 'org/*' format. + Run discover mode or sync mode. """ - repo_paths = list(filter(None, config['repository'].split(' '))) - - orgs_with_all_repos = list(filter(lambda x: x.split('/')[1] == '*', repo_paths)) - - if orgs_with_all_repos: - # remove any wildcard "org/*" occurrences from `repo_paths` - repo_paths = list(set(repo_paths).difference(set(orgs_with_all_repos))) - - # get all repositores for an org in the config - all_repos = get_all_repos(orgs_with_all_repos) - - # update repo_paths - repo_paths.extend(all_repos) - - return repo_paths - -def verify_repo_access(url_for_repo, repo): - try: - authed_get("verifying repository access", url_for_repo) - except NotFoundException: - # throwing user-friendly error message as it checks token access - message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) - raise NotFoundException(message) from None - -def verify_access_for_repo(config): - - access_token = config['access_token'] - session.headers.update({'authorization': 'token ' + access_token, 'per_page': '1', 'page': 
'1'}) - - repositories = extract_repos_from_config(config) - - for repo in repositories: - logger.info("Verifying access of repository: %s", repo) - - url_for_repo = "https://api.github.com/repos/{}/commits".format(repo) - - # Verifying for Repo access - verify_repo_access(url_for_repo, repo) - -def do_discover(config): - verify_access_for_repo(config) - catalog = get_catalog() - # dump catalog - print(json.dumps(catalog, indent=2)) - -def get_all_teams(schemas, repo_path, state, mdata, _start_date): - org = repo_path.split('/')[0] - with metrics.record_counter('teams') as counter: - for response in authed_get_all_pages( - 'teams', - 'https://api.github.com/orgs/{}/teams?sort=created_at&direction=desc'.format(org) - ): - teams = response.json() - extraction_time = singer.utils.now() - - for r in teams: - team_slug = r.get('slug') - r['_sdc_repository'] = repo_path - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas['teams'], metadata=metadata.to_map(mdata['teams'])) - singer.write_record('teams', rec, time_extracted=extraction_time) - counter.increment() - - if schemas.get('team_members'): - for team_members_rec in get_all_team_members(team_slug, schemas['team_members'], repo_path, state, mdata['team_members']): - singer.write_record('team_members', team_members_rec, time_extracted=extraction_time) - - if schemas.get('team_memberships'): - for team_memberships_rec in get_all_team_memberships(team_slug, schemas['team_memberships'], repo_path, state, mdata['team_memberships']): - singer.write_record('team_memberships', team_memberships_rec, time_extracted=extraction_time) - - return state - -def get_all_team_members(team_slug, schemas, repo_path, state, mdata): - org = repo_path.split('/')[0] - with metrics.record_counter('team_members') as counter: - for response in authed_get_all_pages( - 'team_members', - 
'https://api.github.com/orgs/{}/teams/{}/members?sort=created_at&direction=desc'.format(org, team_slug) - ): - team_members = response.json() - for r in team_members: - r['_sdc_repository'] = repo_path - r['team_slug'] = team_slug - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) - counter.increment() - - yield rec - - return state - -def get_all_team_memberships(team_slug, schemas, repo_path, state, mdata): - org = repo_path.split('/')[0] - for response in authed_get_all_pages( - 'team_members', - 'https://api.github.com/orgs/{}/teams/{}/members?sort=created_at&direction=desc'.format(org, team_slug) - ): - team_members = response.json() - with metrics.record_counter('team_memberships') as counter: - for r in team_members: - username = r['login'] - for res in authed_get_all_pages( - 'memberships', - 'https://api.github.com/orgs/{}/teams/{}/memberships/{}'.format(org, team_slug, username) - ): - team_membership = res.json() - team_membership['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(team_membership, schemas, metadata=metadata.to_map(mdata)) - counter.increment() - yield rec - return state - - -def get_all_issue_events(schemas, repo_path, state, mdata, start_date): - bookmark_value = get_bookmark(state, repo_path, "issue_events", "since", start_date) - if bookmark_value: - bookmark_time = singer.utils.strptime_to_utc(bookmark_value) - else: - bookmark_time = 0 - - - with metrics.record_counter('issue_events') as counter: - for response in authed_get_all_pages( - 'issue_events', - 'https://api.github.com/repos/{}/issues/events?sort=created_at&direction=desc'.format(repo_path) - ): - events = response.json() - extraction_time = singer.utils.now() - for event in events: - event['_sdc_repository'] = repo_path - # skip records that haven't been updated since the last run - # the GitHub API doesn't 
currently allow a ?since param for pulls - # once we find the first piece of old data we can return, thanks to - # the sorting - updated_at = event.get('created_at') if event.get('updated_at') is None else event.get('updated_at') - if bookmark_time and singer.utils.strptime_to_utc(updated_at) < bookmark_time: - return state - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(event, schemas, metadata=metadata.to_map(mdata)) - singer.write_record('issue_events', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'issue_events', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - - return state - - -def get_all_events(schemas, repo_path, state, mdata, start_date): - # Incremental sync off `created_at` - # https://developer.github.com/v3/issues/events/#list-events-for-a-repository - # 'https://api.github.com/repos/{}/issues/events?sort=created_at&direction=desc'.format(repo_path) - - bookmark_value = get_bookmark(state, repo_path, "events", "since", start_date) - if bookmark_value: - bookmark_time = singer.utils.strptime_to_utc(bookmark_value) - else: - bookmark_time = 0 - - with metrics.record_counter('events') as counter: - for response in authed_get_all_pages( - 'events', - 'https://api.github.com/repos/{}/events?sort=created_at&direction=desc'.format(repo_path) - ): - events = response.json() - extraction_time = singer.utils.now() - for r in events: - r['_sdc_repository'] = repo_path - - # skip records that haven't been updated since the last run - # the GitHub API doesn't currently allow a ?since param for pulls - # once we find the first piece of old data we can return, thanks to - # the sorting - updated_at = r.get('created_at') if r.get('updated_at') is None else r.get('updated_at') - if bookmark_time and singer.utils.strptime_to_utc(updated_at) < bookmark_time: - return state - - # transform and write release record - with singer.Transformer() as 
transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) - singer.write_record('events', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'events', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - - return state - -def get_all_issue_milestones(schemas, repo_path, state, mdata, start_date): - # Incremental sync off `due on` ??? confirm. - # https://developer.github.com/v3/issues/milestones/#list-milestones-for-a-repository - # 'https://api.github.com/repos/{}/milestones?sort=created_at&direction=desc'.format(repo_path) - bookmark_value = get_bookmark(state, repo_path, "issue_milestones", "since", start_date) - if bookmark_value: - bookmark_time = singer.utils.strptime_to_utc(bookmark_value) - else: - bookmark_time = 0 - - with metrics.record_counter('issue_milestones') as counter: - for response in authed_get_all_pages( - 'milestones', - 'https://api.github.com/repos/{}/milestones?direction=desc'.format(repo_path) - ): - milestones = response.json() - extraction_time = singer.utils.now() - for r in milestones: - r['_sdc_repository'] = repo_path - - # skip records that haven't been updated since the last run - # the GitHub API doesn't currently allow a ?since param for pulls - # once we find the first piece of old data we can return, thanks to - # the sorting - if bookmark_time and r.get("due_on") and singer.utils.strptime_to_utc(r.get("due_on")) < bookmark_time: - continue - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) - singer.write_record('issue_milestones', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'issue_milestones', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - - return state - -def get_all_issue_labels(schemas, repo_path, state, mdata, _start_date): - # https://developer.github.com/v3/issues/labels/ 
- # not sure if incremental key - # 'https://api.github.com/repos/{}/labels?sort=created_at&direction=desc'.format(repo_path) - - with metrics.record_counter('issue_labels') as counter: - for response in authed_get_all_pages( - 'issue_labels', - 'https://api.github.com/repos/{}/labels'.format(repo_path) - ): - issue_labels = response.json() - extraction_time = singer.utils.now() - for r in issue_labels: - r['_sdc_repository'] = repo_path - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) - singer.write_record('issue_labels', rec, time_extracted=extraction_time) - counter.increment() - - return state - -def get_all_commit_comments(schemas, repo_path, state, mdata, start_date): - # https://developer.github.com/v3/repos/comments/ - # updated_at? incremental - # 'https://api.github.com/repos/{}/comments?sort=created_at&direction=desc'.format(repo_path) - bookmark_value = get_bookmark(state, repo_path, "commit_comments", "since", start_date) - if bookmark_value: - bookmark_time = singer.utils.strptime_to_utc(bookmark_value) - else: - bookmark_time = 0 - - with metrics.record_counter('commit_comments') as counter: - for response in authed_get_all_pages( - 'commit_comments', - 'https://api.github.com/repos/{}/comments?sort=created_at&direction=desc'.format(repo_path) - ): - commit_comments = response.json() - extraction_time = singer.utils.now() - for r in commit_comments: - r['_sdc_repository'] = repo_path - - # skip records that haven't been updated since the last run - # the GitHub API doesn't currently allow a ?since param for pulls - # once we find the first piece of old data we can return, thanks to - # the sorting - if bookmark_time and singer.utils.strptime_to_utc(r.get('updated_at')) < bookmark_time: - return state - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, 
metadata=metadata.to_map(mdata)) - singer.write_record('commit_comments', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'commit_comments', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - - return state - -def get_all_projects(schemas, repo_path, state, mdata, start_date): - bookmark_value = get_bookmark(state, repo_path, "projects", "since", start_date) - if bookmark_value: - bookmark_time = singer.utils.strptime_to_utc(bookmark_value) - else: - bookmark_time = 0 - - with metrics.record_counter('projects') as counter: - #pylint: disable=too-many-nested-blocks - for response in authed_get_all_pages( - 'projects', - 'https://api.github.com/repos/{}/projects?sort=created_at&direction=desc'.format(repo_path), - { 'Accept': 'application/vnd.github.inertia-preview+json' } - ): - projects = response.json() - extraction_time = singer.utils.now() - for r in projects: - r['_sdc_repository'] = repo_path - - # skip records that haven't been updated since the last run - # the GitHub API doesn't currently allow a ?since param for pulls - # once we find the first piece of old data we can return, thanks to - # the sorting - if bookmark_time and singer.utils.strptime_to_utc(r.get('updated_at')) < bookmark_time: - return state - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas['projects'], metadata=metadata.to_map(mdata['projects'])) - singer.write_record('projects', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'projects', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - - project_id = r.get('id') - - - - # sync project_columns if that schema is present (only there if selected) - if schemas.get('project_columns'): - for project_column_rec in get_all_project_columns(project_id, schemas['project_columns'], repo_path, state, mdata['project_columns'], start_date): - 
singer.write_record('project_columns', project_column_rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'project_columns', {'since': singer.utils.strftime(extraction_time)}) - - # sync project_cards if that schema is present (only there if selected) - if schemas.get('project_cards'): - column_id = project_column_rec['id'] - for project_card_rec in get_all_project_cards(column_id, schemas['project_cards'], repo_path, state, mdata['project_cards'], start_date): - singer.write_record('project_cards', project_card_rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'project_cards', {'since': singer.utils.strftime(extraction_time)}) - return state - - -def get_all_project_cards(column_id, schemas, repo_path, state, mdata, start_date): - bookmark_value = get_bookmark(state, repo_path, "project_cards", "since", start_date) - if bookmark_value: - bookmark_time = singer.utils.strptime_to_utc(bookmark_value) - else: - bookmark_time = 0 - - with metrics.record_counter('project_cards') as counter: - for response in authed_get_all_pages( - 'project_cards', - 'https://api.github.com/projects/columns/{}/cards?sort=created_at&direction=desc'.format(column_id) - ): - project_cards = response.json() - for r in project_cards: - r['_sdc_repository'] = repo_path - - # skip records that haven't been updated since the last run - # the GitHub API doesn't currently allow a ?since param for pulls - # once we find the first piece of old data we can return, thanks to - # the sorting - if bookmark_time and singer.utils.strptime_to_utc(r.get('updated_at')) < bookmark_time: - return state - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) - counter.increment() - yield rec - - return state - -def get_all_project_columns(project_id, schemas, repo_path, state, mdata, start_date): - bookmark_value = get_bookmark(state, repo_path, 
"project_columns", "since", start_date) - if bookmark_value: - bookmark_time = singer.utils.strptime_to_utc(bookmark_value) - else: - bookmark_time = 0 - - with metrics.record_counter('project_columns') as counter: - for response in authed_get_all_pages( - 'project_columns', - 'https://api.github.com/projects/{}/columns?sort=created_at&direction=desc'.format(project_id) - ): - project_columns = response.json() - for r in project_columns: - r['_sdc_repository'] = repo_path - - # skip records that haven't been updated since the last run - # the GitHub API doesn't currently allow a ?since param for pulls - # once we find the first piece of old data we can return, thanks to - # the sorting - if bookmark_time and singer.utils.strptime_to_utc(r.get('updated_at')) < bookmark_time: - return state - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) - counter.increment() - yield rec - - return state - -def get_all_releases(schemas, repo_path, state, mdata, _start_date): - # Releases doesn't seem to have an `updated_at` property, yet can be edited. - # For this reason and since the volume of release can safely be considered low, - # bookmarks were ignored for releases. 
- - with metrics.record_counter('releases') as counter: - for response in authed_get_all_pages( - 'releases', - 'https://api.github.com/repos/{}/releases?sort=created_at&direction=desc'.format(repo_path) - ): - releases = response.json() - extraction_time = singer.utils.now() - for r in releases: - r['_sdc_repository'] = repo_path - - # transform and write release record - with singer.Transformer() as transformer: - rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata)) - singer.write_record('releases', rec, time_extracted=extraction_time) - counter.increment() - - return state - -def get_all_pull_requests(schemas, repo_path, state, mdata, start_date): - ''' - https://developer.github.com/v3/pulls/#list-pull-requests - ''' - - bookmark_value = get_bookmark(state, repo_path, "pull_requests", "since", start_date) - if bookmark_value: - bookmark_time = singer.utils.strptime_to_utc(bookmark_value) - else: - bookmark_time = 0 - - with metrics.record_counter('pull_requests') as counter: - with metrics.record_counter('reviews') as reviews_counter: - for response in authed_get_all_pages( - 'pull_requests', - 'https://api.github.com/repos/{}/pulls?state=all&sort=updated&direction=desc'.format(repo_path) - ): - pull_requests = response.json() - extraction_time = singer.utils.now() - for pr in pull_requests: - - - # skip records that haven't been updated since the last run - # the GitHub API doesn't currently allow a ?since param for pulls - # once we find the first piece of old data we can return, thanks to - # the sorting - if bookmark_time and singer.utils.strptime_to_utc(pr.get('updated_at')) < bookmark_time: - return state - - pr_num = pr.get('number') - pr_id = pr.get('id') - pr['_sdc_repository'] = repo_path - - # transform and write pull_request record - with singer.Transformer() as transformer: - rec = transformer.transform(pr, schemas['pull_requests'], metadata=metadata.to_map(mdata['pull_requests'])) - singer.write_record('pull_requests', rec, 
time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'pull_requests', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - - # sync reviews if that schema is present (only there if selected) - if schemas.get('reviews'): - for review_rec in get_reviews_for_pr(pr_num, schemas['reviews'], repo_path, state, mdata['reviews']): - singer.write_record('reviews', review_rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'reviews', {'since': singer.utils.strftime(extraction_time)}) - - reviews_counter.increment() - - # sync review comments if that schema is present (only there if selected) - if schemas.get('review_comments'): - for review_comment_rec in get_review_comments_for_pr(pr_num, schemas['review_comments'], repo_path, state, mdata['review_comments']): - singer.write_record('review_comments', review_comment_rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'review_comments', {'since': singer.utils.strftime(extraction_time)}) - - if schemas.get('pr_commits'): - for pr_commit in get_commits_for_pr( - pr_num, - pr_id, - schemas['pr_commits'], - repo_path, - state, - mdata['pr_commits'] - ): - singer.write_record('pr_commits', pr_commit, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'pr_commits', {'since': singer.utils.strftime(extraction_time)}) - - return state - -def get_reviews_for_pr(pr_number, schema, repo_path, state, mdata): - for response in authed_get_all_pages( - 'reviews', - 'https://api.github.com/repos/{}/pulls/{}/reviews'.format(repo_path,pr_number) - ): - reviews = response.json() - for review in reviews: - review['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(review, schema, metadata=metadata.to_map(mdata)) - yield rec - - - return state - -def get_review_comments_for_pr(pr_number, schema, repo_path, state, mdata): - for response in authed_get_all_pages( - 'comments', - 
'https://api.github.com/repos/{}/pulls/{}/comments'.format(repo_path,pr_number) - ): - review_comments = response.json() - for comment in review_comments: - comment['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(comment, schema, metadata=metadata.to_map(mdata)) - yield rec - - - return state - -def get_commits_for_pr(pr_number, pr_id, schema, repo_path, state, mdata): - for response in authed_get_all_pages( - 'pr_commits', - 'https://api.github.com/repos/{}/pulls/{}/commits'.format(repo_path,pr_number) - ): - - commit_data = response.json() - for commit in commit_data: - commit['_sdc_repository'] = repo_path - commit['pr_number'] = pr_number - commit['pr_id'] = pr_id - commit['id'] = '{}-{}'.format(pr_id, commit['sha']) - with singer.Transformer() as transformer: - rec = transformer.transform(commit, schema, metadata=metadata.to_map(mdata)) - yield rec - - return state - - -def get_all_assignees(schema, repo_path, state, mdata, _start_date): - ''' - https://developer.github.com/v3/issues/assignees/#list-assignees - ''' - with metrics.record_counter('assignees') as counter: - for response in authed_get_all_pages( - 'assignees', - 'https://api.github.com/repos/{}/assignees'.format(repo_path) - ): - assignees = response.json() - extraction_time = singer.utils.now() - for assignee in assignees: - assignee['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(assignee, schema, metadata=metadata.to_map(mdata)) - singer.write_record('assignees', rec, time_extracted=extraction_time) - counter.increment() - - return state - -def get_all_collaborators(schema, repo_path, state, mdata, _start_date): - ''' - https://developer.github.com/v3/repos/collaborators/#list-collaborators - ''' - with metrics.record_counter('collaborators') as counter: - try: - responses = authed_get_all_pages( - 'collaborators', - 'https://api.github.com/repos/{}/collaborators'.format(repo_path) 
- ) - except NotFoundException as error: - logger.info( - 'Unable to retreive collaborators stream, check access_token is valid for %s. See full error message: %s', - repo_path, error - ) - else: - for response in responses: - collaborators = response.json() - extraction_time = singer.utils.now() - for collaborator in collaborators: - collaborator['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(collaborator, schema, metadata=metadata.to_map(mdata)) - singer.write_record('collaborators', rec, time_extracted=extraction_time) - counter.increment() - - return state - -def get_all_commits(schema, repo_path, state, mdata, start_date): - ''' - https://developer.github.com/v3/repos/commits/#list-commits-on-a-repository - ''' - bookmark = get_bookmark(state, repo_path, "commits", "since", start_date) - if bookmark: - query_string = '?since={}'.format(bookmark) - else: - query_string = '' - - with metrics.record_counter('commits') as counter: - for response in authed_get_all_pages( - 'commits', - 'https://api.github.com/repos/{}/commits{}'.format(repo_path, query_string) - ): - commits = response.json() - extraction_time = singer.utils.now() - for commit in commits: - commit['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(commit, schema, metadata=metadata.to_map(mdata)) - singer.write_record('commits', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'commits', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - - return state - -def get_all_issues(schema, repo_path, state, mdata, start_date): - ''' - https://developer.github.com/v3/issues/#list-issues-for-a-repository - ''' - - bookmark = get_bookmark(state, repo_path, "issues", "since", start_date) - if bookmark: - query_string = '&since={}'.format(bookmark) - else: - query_string = '' - - with metrics.record_counter('issues') as counter: - for response in 
authed_get_all_pages( - 'issues', - 'https://api.github.com/repos/{}/issues?state=all&sort=updated&direction=asc{}'.format(repo_path, query_string) - ): - issues = response.json() - extraction_time = singer.utils.now() - for issue in issues: - issue['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(issue, schema, metadata=metadata.to_map(mdata)) - singer.write_record('issues', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'issues', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - return state - -def get_all_comments(schema, repo_path, state, mdata, start_date): - ''' - https://developer.github.com/v3/issues/comments/#list-comments-in-a-repository - ''' - - bookmark = get_bookmark(state, repo_path, "comments", "since", start_date) - if bookmark: - query_string = '&since={}'.format(bookmark) - else: - query_string = '' - - with metrics.record_counter('comments') as counter: - for response in authed_get_all_pages( - 'comments', - 'https://api.github.com/repos/{}/issues/comments?sort=updated&direction=asc{}'.format(repo_path, query_string) - ): - comments = response.json() - extraction_time = singer.utils.now() - for comment in comments: - comment['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(comment, schema, metadata=metadata.to_map(mdata)) - singer.write_record('comments', rec, time_extracted=extraction_time) - singer.write_bookmark(state, repo_path, 'comments', {'since': singer.utils.strftime(extraction_time)}) - counter.increment() - return state - -def get_all_stargazers(schema, repo_path, state, mdata, _start_date): - ''' - https://developer.github.com/v3/activity/starring/#list-stargazers - ''' - - stargazers_headers = {'Accept': 'application/vnd.github.v3.star+json'} - - with metrics.record_counter('stargazers') as counter: - for response in authed_get_all_pages( - 'stargazers', - 
'https://api.github.com/repos/{}/stargazers'.format(repo_path), stargazers_headers - ): - stargazers = response.json() - extraction_time = singer.utils.now() - for stargazer in stargazers: - user_id = stargazer['user']['id'] - stargazer['_sdc_repository'] = repo_path - with singer.Transformer() as transformer: - rec = transformer.transform(stargazer, schema, metadata=metadata.to_map(mdata)) - rec['user_id'] = user_id - singer.write_record('stargazers', rec, time_extracted=extraction_time) - counter.increment() - - return state - -def get_selected_streams(catalog): - ''' - Gets selected streams. Checks schema's 'selected' - first -- and then checks metadata, looking for an empty - breadcrumb and mdata with a 'selected' entry - ''' - selected_streams = [] - for stream in catalog['streams']: - stream_metadata = stream['metadata'] - if stream['schema'].get('selected', False): - selected_streams.append(stream['tap_stream_id']) - else: - for entry in stream_metadata: - # stream metadata will have empty breadcrumb - if not entry['breadcrumb'] and entry['metadata'].get('selected',None): - selected_streams.append(stream['tap_stream_id']) - - return selected_streams - -def get_stream_from_catalog(stream_id, catalog): - for stream in catalog['streams']: - if stream['tap_stream_id'] == stream_id: - return stream - return None - -# return the 'timeout' -def get_request_timeout(): - args = singer.utils.parse_args([]) - # get the value of request timeout from config - config_request_timeout = args.config.get('request_timeout') - - # only return the timeout value if it is passed in the config and the value is not 0, "0" or "" - if config_request_timeout and float(config_request_timeout): - # return the timeout from config - return float(config_request_timeout) - - # return default timeout - return REQUEST_TIMEOUT - -SYNC_FUNCTIONS = { - 'commits': get_all_commits, - 'comments': get_all_comments, - 'issues': get_all_issues, - 'assignees': get_all_assignees, - 'collaborators': 
get_all_collaborators, - 'pull_requests': get_all_pull_requests, - 'releases': get_all_releases, - 'stargazers': get_all_stargazers, - 'events': get_all_events, - 'issue_events': get_all_issue_events, - 'issue_milestones': get_all_issue_milestones, - 'issue_labels': get_all_issue_labels, - 'projects': get_all_projects, - 'commit_comments': get_all_commit_comments, - 'teams': get_all_teams -} - -SUB_STREAMS = { - 'pull_requests': ['reviews', 'review_comments', 'pr_commits'], - 'projects': ['project_cards', 'project_columns'], - 'teams': ['team_members', 'team_memberships'] -} - -def do_sync(config, state, catalog): - access_token = config['access_token'] - session.headers.update({'authorization': 'token ' + access_token}) - - start_date = config['start_date'] if 'start_date' in config else None - # get selected streams, make sure stream dependencies are met - selected_stream_ids = get_selected_streams(catalog) - validate_dependencies(selected_stream_ids) - - repositories = extract_repos_from_config(config) - - state = translate_state(state, catalog, repositories) - singer.write_state(state) - - #pylint: disable=too-many-nested-blocks - for repo in repositories: - logger.info("Starting sync of repository: %s", repo) - for stream in catalog['streams']: - stream_id = stream['tap_stream_id'] - stream_schema = stream['schema'] - mdata = stream['metadata'] - - # if it is a "sub_stream", it will be sync'd by its parent - if not SYNC_FUNCTIONS.get(stream_id): - continue - - # if stream is selected, write schema and sync - if stream_id in selected_stream_ids: - singer.write_schema(stream_id, stream_schema, stream['key_properties']) - - # get sync function and any sub streams - sync_func = SYNC_FUNCTIONS[stream_id] - sub_stream_ids = SUB_STREAMS.get(stream_id, None) - - # sync stream - if not sub_stream_ids: - state = sync_func(stream_schema, repo, state, mdata, start_date) - - # handle streams with sub streams - else: - stream_schemas = {stream_id: stream_schema} - 
stream_mdata = {stream_id: mdata} - - # get and write selected sub stream schemas - for sub_stream_id in sub_stream_ids: - if sub_stream_id in selected_stream_ids: - sub_stream = get_stream_from_catalog(sub_stream_id, catalog) - stream_schemas[sub_stream_id] = sub_stream['schema'] - stream_mdata[sub_stream_id] = sub_stream['metadata'] - singer.write_schema(sub_stream_id, sub_stream['schema'], - sub_stream['key_properties']) - - # sync stream and it's sub streams - state = sync_func(stream_schemas, repo, state, stream_mdata, start_date) - - singer.write_state(state) - -@singer.utils.handle_top_exception(logger) -def main(): args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS) - # get optional config key `max_sleep_seconds` - config_max_sleep = args.config.get('max_sleep_seconds') + config = args.config + + client = GithubClient(config) - # set global `MAX_SLEEP_SECONDS` for rate_throttling function or use default - global MAX_SLEEP_SECONDS #pylint: disable=global-statement - MAX_SLEEP_SECONDS = config_max_sleep if config_max_sleep else DEFAULT_SLEEP_SECONDS + state = {} + if args.state: + state = args.state if args.discover: - do_discover(args.config) + do_discover(client) else: - catalog = args.properties if args.properties else get_catalog() - do_sync(args.config, args.state, catalog) + catalog = args.properties if args.properties else _discover(client) + _sync(client, config, state, catalog) if __name__ == '__main__': main() diff --git a/tap_github/client.py b/tap_github/client.py new file mode 100644 index 00000000..9913a8c2 --- /dev/null +++ b/tap_github/client.py @@ -0,0 +1,344 @@ +import time +import requests +import backoff +from simplejson import JSONDecodeError +import singer +from singer import metrics + +LOGGER = singer.get_logger() +DEFAULT_SLEEP_SECONDS = 600 +DEFAULT_DOMAIN = "https://api.github.com" + +# Set default timeout of 300 seconds +REQUEST_TIMEOUT = 300 + +class GithubException(Exception): + pass + +class Server5xxError(GithubException): + 
pass + +class BadCredentialsException(GithubException): + pass + +class AuthException(GithubException): + pass + +class NotFoundException(GithubException): + pass + +class BadRequestException(GithubException): + pass + +class InternalServerError(Server5xxError): + pass + +class UnprocessableError(GithubException): + pass + +class NotModifiedError(GithubException): + pass + +class MovedPermanentlyError(GithubException): + pass + +class ConflictError(GithubException): + pass + +class RateLimitExceeded(GithubException): + pass + +class TooManyRequests(GithubException): + pass + + +ERROR_CODE_EXCEPTION_MAPPING = { + 301: { + "raise_exception": MovedPermanentlyError, + "message": "The resource you are looking for is moved to another URL." + }, + 304: { + "raise_exception": NotModifiedError, + "message": "The requested resource has not been modified since the last time you accessed it." + }, + 400:{ + "raise_exception": BadRequestException, + "message": "The request is missing or has a bad parameter." + }, + 401: { + "raise_exception": BadCredentialsException, + "message": "Invalid authorization credentials." + }, + 403: { + "raise_exception": AuthException, + "message": "User doesn't have permission to access the resource." + }, + 404: { + "raise_exception": NotFoundException, + "message": "The resource you have specified cannot be found. Alternatively the access_token is not valid for the resource" + }, + 409: { + "raise_exception": ConflictError, + "message": "The request could not be completed due to a conflict with the current state of the server." + }, + 422: { + "raise_exception": UnprocessableError, + "message": "The request was not able to process right now." + }, + 429: { + "raise_exception": TooManyRequests, + "message": "Too many requests occurred." + }, + 500: { + "raise_exception": InternalServerError, + "message": "An error has occurred at Github's end." 
+ } +} + +def raise_for_error(resp, source, stream, client, should_skip_404): + """ + Retrieve the error code and the error message from the response and return custom exceptions accordingly. + """ + error_code = resp.status_code + try: + response_json = resp.json() + except JSONDecodeError: + response_json = {} + + if error_code == 404 and should_skip_404: + # Add not accessible stream into list. + client.not_accessible_repos.add(stream) + details = ERROR_CODE_EXCEPTION_MAPPING.get(error_code).get("message") + if source == "teams": + details += ' or it is a personal account repository' + message = "HTTP-error-code: 404, Error: {}. Please refer \'{}\' for more details.".format(details, response_json.get("documentation_url")) + LOGGER.warning(message) + # Don't raise a NotFoundException + return None + + message = "HTTP-error-code: {}, Error: {}".format( + error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) + + if error_code > 500: + raise Server5xxError(message) from None + + exc = ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("raise_exception", GithubException) + raise exc(message) from None + +def calculate_seconds(epoch): + """ + Calculate the seconds to sleep before making a new request. + """ + current = time.time() + return int(round((epoch - current), 0)) + +def rate_throttling(response, max_sleep_seconds): + """ + For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. + """ + if 'X-RateLimit-Remaining' in response.headers: + if int(response.headers['X-RateLimit-Remaining']) == 0: + seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) + + if seconds_to_sleep > max_sleep_seconds: + message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) + raise RateLimitExceeded(message) from None + + LOGGER.info("API rate limit exceeded. 
Tap will retry the data collection after %s seconds.", seconds_to_sleep) + time.sleep(seconds_to_sleep) + else: + # Raise an exception if `X-RateLimit-Remaining` is not found in the header. + # API does include this key header if provided base URL is not a valid github custom domain. + raise GithubException("The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") + +class GithubClient: + """ + The client class used for making REST calls to the Github API. + """ + def __init__(self, config): + self.config = config + self.session = requests.Session() + self.base_url = config['base_url'] if config.get('base_url') else DEFAULT_DOMAIN + self.max_sleep_seconds = self.config.get('max_sleep_seconds', DEFAULT_SLEEP_SECONDS) + self.set_auth_in_session() + self.not_accessible_repos = set() + + def get_request_timeout(self): + """ + Get the request timeout from the config, if not present use the default 300 seconds. + """ + # Get the value of request timeout from config + config_request_timeout = self.config.get('request_timeout') + + # Only return the timeout value if it is passed in the config and the value is not 0, "0" or "" + if config_request_timeout and float(config_request_timeout): + return float(config_request_timeout) + + # Return default timeout + return REQUEST_TIMEOUT + + def set_auth_in_session(self): + """ + Set access token in the header for authorization. + """ + access_token = self.config['access_token'] + self.session.headers.update({'authorization': 'token ' + access_token}) + + # pylint: disable=dangerous-default-value + # During 'Timeout' error there is also possibility of 'ConnectionError', + # hence added backoff for 'ConnectionError' too. 
+ @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) + def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True): + """ + Call rest API and return the response in case of status code 200. + """ + with metrics.http_request_timer(source) as timer: + self.session.headers.update(headers) + resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) + if resp.status_code != 200: + raise_for_error(resp, source, stream, self, should_skip_404) + timer.tags[metrics.Tag.http_status_code] = resp.status_code + rate_throttling(resp, self.max_sleep_seconds) + if resp.status_code == 404: + # Return an empty response body since we're not raising a NotFoundException + resp._content = b'{}' # pylint: disable=protected-access + return resp + + def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): + """ + Fetch all pages of records and return them. + """ + while True: + r = self.authed_get(source, url, headers, stream, should_skip_404) + yield r + + # Fetch the next page if next found in the response. + if 'next' in r.links: + url = r.links['next']['url'] + else: + # Break the loop if all pages are fetched. + break + + def verify_repo_access(self, url_for_repo, repo): + """ + Call rest API to verify that the user has sufficient permissions to access this repository. + """ + try: + self.authed_get("verifying repository access", url_for_repo) + except NotFoundException: + # Throwing user-friendly error message as it checks token access + message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) + raise NotFoundException(message) from None + + def verify_access_for_repo(self): + """ + For all the repositories mentioned in the config, check the access for each repos. 
+ """ + repositories, org = self.extract_repos_from_config() # pylint: disable=unused-variable + + for repo in repositories: + + url_for_repo = "{}/repos/{}/commits".format(self.base_url, repo) + LOGGER.info("Verifying access of repository: %s", repo) + + # Verifying for Repo access + self.verify_repo_access(url_for_repo, repo) + + def extract_orgs_from_config(self): + """ + Extracts all organizations from the config + """ + repo_paths = list(filter(None, self.config['repository'].split(' '))) + orgs_paths = [repo.split('/')[0] for repo in repo_paths] + + return set(orgs_paths) + + def extract_repos_from_config(self): + """ + Extracts all repositories from the config and calls get_all_repos() + for organizations using the wildcard 'org/*' format. + """ + repo_paths = list(filter(None, self.config['repository'].split(' '))) + + unique_repos = set() + # Insert the duplicate repos found in the config repo_paths into duplicates + duplicate_repos = [x for x in repo_paths if x in unique_repos or (unique_repos.add(x) or False)] + if duplicate_repos: + LOGGER.warning("Duplicate repositories found: %s and will be synced only once.", duplicate_repos) + + repo_paths = list(set(repo_paths)) + + orgs_with_all_repos = [] + orgs = [] + repos_with_errors = [] + for repo in repo_paths: + # Split the repo_path by `/` as we are passing org/repo_name in the config. + split_repo_path = repo.split('/') + # Prepare list of organizations + orgs.append(split_repo_path[0]) + # Check for the second element in the split list only if the length is greater than 1 and the first/second + # element is not empty (for scenarios such as: `org/` or `/repo` which is invalid) + if len(split_repo_path) > 1 and split_repo_path[1] != '' and split_repo_path[0] != '': + # If the second element is *, we need to check access for all the repos. 
+ if split_repo_path[1] == '*': + orgs_with_all_repos.append(repo) + else: + # If `/`/repo name/organization not found, append the repo_path in the repos_with_errors + repos_with_errors.append(repo) + + # If any repos found in repos_with_errors, raise an exception + if repos_with_errors: + raise GithubException("Please provide valid organization/repository for: {}".format(sorted(repos_with_errors))) + + if orgs_with_all_repos: + # Remove any wildcard "org/*" occurrences from `repo_paths` + repo_paths = list(set(repo_paths).difference(set(orgs_with_all_repos))) + + # Get all repositories for an org in the config + all_repos = self.get_all_repos(orgs_with_all_repos) + + # Update repo_paths + repo_paths.extend(all_repos) + + return repo_paths, set(orgs) + + def get_all_repos(self, organizations: list): + """ + Retrieves all repositories for the provided organizations and + verifies basic access for them. + + Docs: https://docs.github.com/en/rest/reference/repos#list-organization-repositories + """ + repos = [] + + for org_path in organizations: + org = org_path.split('/')[0] + try: + for response in self.authed_get_all_pages( + 'get_all_repos', + '{}/orgs/{}/repos?sort=created&direction=desc'.format(self.base_url, org), + should_skip_404 = False + ): + org_repos = response.json() + LOGGER.info("Collected repos for organization: %s", org) + + for repo in org_repos: + repo_full_name = repo.get('full_name') + LOGGER.info("Verifying access of repository: %s", repo_full_name) + + self.verify_repo_access( + '{}/repos/{}/commits'.format(self.base_url,repo_full_name), + repo + ) + + repos.append(repo_full_name) + except NotFoundException: + # Throwing user-friendly error message as it checks token access + message = "HTTP-error-code: 404, Error: Please check the organization name \'{}\' or you do not have sufficient permissions to access this organization.".format(org) + raise NotFoundException(message) from None + + return repos + + def __exit__(self, exception_type, 
exception_value, traceback): + # Kill the session instance. + self.session.close() diff --git a/tap_github/discover.py b/tap_github/discover.py new file mode 100644 index 00000000..386857ee --- /dev/null +++ b/tap_github/discover.py @@ -0,0 +1,36 @@ +import singer +from singer.catalog import Catalog, CatalogEntry, Schema +from tap_github.schema import get_schemas + +LOGGER = singer.get_logger() + +def discover(client): + """ + Run the discovery mode, prepare the catalog file and return catalog. + """ + # Check credential in the discover mode. + client.verify_access_for_repo() + + schemas, field_metadata = get_schemas() + catalog = Catalog([]) + + for stream_name, schema_dict in schemas.items(): + try: + schema = Schema.from_dict(schema_dict) + mdata = field_metadata[stream_name] + except Exception as err: + LOGGER.error(err) + LOGGER.error('stream_name: %s', stream_name) + LOGGER.error('type schema_dict: %s', type(schema_dict)) + raise err + + key_properties = mdata[0]['metadata'].get('table-key-properties') + catalog.streams.append(CatalogEntry( + stream=stream_name, + tap_stream_id=stream_name, + key_properties= key_properties, + schema=schema, + metadata=mdata + )) + + return catalog.to_dict() diff --git a/tap_github/schema.py b/tap_github/schema.py new file mode 100644 index 00000000..6b65176c --- /dev/null +++ b/tap_github/schema.py @@ -0,0 +1,68 @@ +import os +import json +from singer import metadata +import singer +from tap_github.streams import STREAMS + +def get_abs_path(path): + """ + Get the absolute path for the schema files. + """ + return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) + +def load_schema_references(): + """ + Load the schema files from the schema folder and return the schema references. 
+ """ + shared_schema_path = get_abs_path('schemas/shared') + + shared_file_names = [f for f in os.listdir(shared_schema_path) + if os.path.isfile(os.path.join(shared_schema_path, f))] + + refs = {} + for shared_schema_file in shared_file_names: + with open(os.path.join(shared_schema_path, shared_schema_file)) as data_file: + refs['shared/' + shared_schema_file] = json.load(data_file) + + return refs + +def get_schemas(): + """ + Load the schema references, prepare metadata for each streams and return schema and metadata for the catalog. + """ + schemas = {} + field_metadata = {} + + refs = load_schema_references() + for stream_name, stream_metadata in STREAMS.items(): + schema_path = get_abs_path('schemas/{}.json'.format(stream_name)) + + with open(schema_path) as file: + schema = json.load(file) + + schemas[stream_name] = schema + schema = singer.resolve_schema_references(schema, refs) + + mdata = metadata.new() + mdata = metadata.get_standard_metadata( + schema=schema, + key_properties = (hasattr(stream_metadata, 'key_properties') or None) and stream_metadata.key_properties, + valid_replication_keys = (hasattr(stream_metadata, 'replication_keys') or None) and stream_metadata.replication_keys, + replication_method = (hasattr(stream_metadata, 'replication_method') or None) and stream_metadata.replication_method + ) + mdata = metadata.to_map(mdata) + + # Loop through all keys and make replication keys and primary keys of child stream which are not automatic in parent stream of automatic inclusion + for field_name in schema['properties'].keys(): + + pk_child_fields = (hasattr(stream_metadata, 'pk_child_fields') or None) and stream_metadata.pk_child_fields + replication_keys = (hasattr(stream_metadata, 'replication_keys') or None) and stream_metadata.replication_keys + if (replication_keys and field_name in replication_keys) or (pk_child_fields and field_name in pk_child_fields): + mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'automatic') + 
+ + mdata = metadata.to_list(mdata) + field_metadata[stream_name] = mdata + + + return schemas, field_metadata diff --git a/tap_github/schemas/assignees.json b/tap_github/schemas/assignees.json index d6162a7a..5c600dd6 100644 --- a/tap_github/schemas/assignees.json +++ b/tap_github/schemas/assignees.json @@ -2,15 +2,66 @@ "type": ["null", "object"], "additionalProperties": false, "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, "login": { "type": ["null", "string"] }, "id": { "type": ["null", "integer"] }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, "url": { "type": ["null", "string"] }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + }, "type": { "type": ["null", "string"] }, diff --git a/tap_github/schemas/collaborators.json b/tap_github/schemas/collaborators.json index d6162a7a..9f71ac07 100644 --- a/tap_github/schemas/collaborators.json +++ b/tap_github/schemas/collaborators.json @@ -8,12 +8,66 @@ "id": { "type": ["null", "integer"] }, + "email": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, "url": { "type": ["null", 
"string"] }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, "type": { "type": ["null", "string"] }, + "site_admin": { + "type": ["null", "boolean"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "role_name": { + "type": ["null", "string"] + }, "_sdc_repository": { "type": ["string"] } diff --git a/tap_github/schemas/comments.json b/tap_github/schemas/comments.json index 7b14a643..ddded9dc 100644 --- a/tap_github/schemas/comments.json +++ b/tap_github/schemas/comments.json @@ -1,5 +1,5 @@ { - "type": "object", + "type": ["null", "object"], "properties": { "id": { "type": ["null", "integer"] @@ -16,6 +16,12 @@ "body": { "type": ["null", "string"] }, + "body_text": { + "type": ["null", "string"] + }, + "body_html": { + "type": ["null", "string"] + }, "html_url": { "type": ["null", "string"] }, @@ -27,63 +33,71 @@ }, "user": { "type": ["null", "object"], - "additionalProperties": false, - "properties": { - "login": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "integer"] - }, - "node_id": { - "type": ["null", "string"] - }, - "avatar_url": { - "type": ["null", "string"] - }, - "gravatar_id": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - }, - "html_url": { - "type": ["null", "string"] - }, - "followers_url": { - "type": ["null", "string"] - }, - "following_url": { - "type": ["null", "string"] - }, - "gists_url": { - "type": ["null", "string"] - }, - "starred_url": { - "type": ["null", "string"] - }, - 
"subscriptions_url": { - "type": ["null", "string"] - }, - "organizations_url": { - "type": ["null", "string"] - }, - "repos_url": { - "type": ["null", "string"] - }, - "events_url": { - "type": ["null", "string"] - }, - "received_events_url": { - "type": ["null", "string"] - }, - "type": { - "type": ["null", "string"] - }, - "site_admin": { - "type": ["null", "string"] + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "string"] + }, + "starred_at": { + "type": ["null", "string"] + } } - } }, "created_at": { "type": ["null", "string"], @@ -93,6 +107,12 @@ "type": ["null", "string"], "format": "date-time" }, + "performed_via_github_app": { + "$ref": "shared/performed_via_github_app.json#/" + }, + "reactions": { + "$ref": "shared/reactions.json#/" + }, "_sdc_repository": { "type": ["string"] } diff --git a/tap_github/schemas/commit_comments.json b/tap_github/schemas/commit_comments.json index c4c01222..408448dc 100644 --- a/tap_github/schemas/commit_comments.json +++ 
b/tap_github/schemas/commit_comments.json @@ -1,192 +1,114 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "body": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "path": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "position": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "line": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "commit_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "user": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, "login": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "avatar_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "gravatar_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "followers_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "following_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "gists_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "starred_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "subscriptions_url": { 
- "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "organizations_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "repos_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "events_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "received_events_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "type": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "site_admin": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] } } }, "created_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "updated_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" + }, + "author_association": { + "type": ["null", "string"] + }, + "reactions": { + "$ref": "shared/reactions.json#/" } } - } \ No newline at end of file +} diff --git a/tap_github/schemas/commits.json b/tap_github/schemas/commits.json index 0e611d11..cf873448 100644 --- a/tap_github/schemas/commits.json +++ b/tap_github/schemas/commits.json @@ -1,13 +1,27 @@ { "type": ["null", "object"], - "additionalProperties": false, "properties": { "_sdc_repository": { "type": ["string"] }, - "sha": { + "node_id": { + "type": ["null", "string"] + }, + "pr_id": { + "type": ["null", "string"] + }, + "pr_number": { + "type": ["null", "integer"] + }, + "id": { + "type": ["null", "string"] + }, + "updated_at": { "type": ["null", "string"], - "description": "The git commit hash" + "format": "date-time" + }, + "sha": { + "type": ["null", "string"] }, "url": { "type": ["null", "string"] @@ -19,97 +33,61 @@ "additionalProperties": false, "properties": { "sha": { - "type": ["null", "string"], - "description": "The git hash of the parent commit" + "type": ["null", "string"] }, "url": { - "type": ["null", "string"], - "description": "The URL to 
the parent commit" + "type": ["null", "string"] }, "html_url": { - "type": ["null", "string"], - "description": "The HTML URL to the parent commit" + "type": ["null", "string"] } } } }, "files": { - "type": [ - "null", - "array" - ], + "type": ["null", "array"], "items": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "filename": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "additions": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "deletions": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "changes": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "status": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "raw_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "blob_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "patch": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } } }, "html_url": { - "type": ["null", "string"], - "description": "The HTML URL to the commit" + "type": ["null", "string"] }, "comments_url": { - "type": ["null", "string"], - "description": "The URL to the commit's comments page" + "type": ["null", "string"] }, "commit": { "type": ["null", "object"], "additionalProperties": false, "properties": { "url": { - "type": ["null", "string"], - "description": "The URL to the commit" + "type": ["null", "string"] }, "tree": { "type": ["null", "object"], @@ -125,57 +103,173 @@ }, "author": { "type": ["null", "object"], - "additionalProperties": false, "properties": { - "date": { - "type": ["null", "string"], - "format": "date-time", - "description": "The date the author committed the change" - }, "name": { - "type": ["null", "string"], - "description": "The author's name" + "type": ["null", "string"] }, "email": { - "type": ["null", "string"], - "description": "The author's email" + "type": ["null", "string"] }, "login": { + 
"type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + }, + "date": { "type": ["null", "string"], - "description": "The author's login" + "format": "date-time" } } }, "message": { - "type": ["null", "string"], - "description": "The commit message" + "type": ["null", "string"] }, "committer": { "type": ["null", "object"], - "additionalProperties": false, "properties": { - "date": { - "type": ["null", "string"], - "format": "date-time", - "description": "The date the committer committed the change" - }, "name": { - "type": ["null", "string"], - "description": "The committer's name" + "type": ["null", "string"] }, "email": { - "type": ["null", "string"], - "description": "The committer's email" + "type": ["null", "string"] }, "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + 
"followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + }, + "date": { "type": ["null", "string"], - "description": "The committer's login" + "format": "date-time" } } }, "comment_count": { - "type": ["null", "integer"], - "description": "The number of comments on the commit" + "type": ["null", "integer"] + } + } + }, + "committer": { + "$ref": "shared/user.json#/" + }, + "author": { + "$ref": "shared/user.json#/" + }, + "stats": { + "type": ["null", "object"], + "properties": { + "additions": { + "type": ["null", "integer"] + }, + "deletions": { + "type": ["null", "integer"] + }, + "total": { + "type": ["null", "integer"] } } } diff --git a/tap_github/schemas/events.json b/tap_github/schemas/events.json index 985a39d0..266ef2c8 100644 --- a/tap_github/schemas/events.json +++ b/tap_github/schemas/events.json @@ -1,223 +1,1015 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "_sdc_repository": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "actor": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "avatar_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "display_login": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "gravatar_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - 
] + "type": ["null", "number"] }, "login": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } }, "created_at": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "org": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "avatar_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "gravatar_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "login": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + }, + "display_login": { + "type": ["null", "string"] } } }, "payload": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "before": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "action": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "comment": { - "type": [ - "null", - "string" - ] + "type": ["null", "object", "string"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "body": { + "type": ["null", "string"] + }, + "body_text": { + "type": ["null", "string"] + }, + "body_html": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "user": { + "$ref": "shared/user.json#/" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "issue_url": { + "type": ["null", "string"], + "format": "uri" + }, + "author_association": { + "type": ["null", "string"] + }, + "performed_via_github_app": { + "$ref": 
"shared/performed_via_github_app.json#/" + }, + "reactions": { + "$ref": "shared/reactions.json#/" + } + } }, "issue": { - "type": [ - "null", - "string" - ] + "type": ["null", "object", "string"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "repository_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "integer"] + }, + "state": { + "type": ["null", "string"] + }, + "state_reason": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "body": { + "type": ["null", "string"] + }, + "user": { + "$ref": "shared/user.json#/" + }, + "labels": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"], + "format": "int64" + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"], + "format": "uri" + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "color": { + "type": ["null", "string"] + }, + "default": { + "type": ["null", "boolean"] + } + } + }, + "assignee": { + "$ref": "shared/user.json#/" + }, + "assignees": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } + }, + "milestone": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "integer"] + }, + "state": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + 
"description": { + "type": ["null", "string"] + }, + "creator": { + "$ref": "shared/user.json#/" + }, + "open_issues": { + "type": ["null", "integer"] + }, + "closed_issues": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "due_on": { + "type": ["null", "string"], + "format": "date-time" + } + } + }, + "locked": { + "type": ["null", "boolean"] + }, + "active_lock_reason": { + "type": ["null", "string"] + }, + "comments": { + "type": ["null", "integer"] + }, + "pull_request": { + "type": ["null", "object"], + "properties": { + "merged_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "diff_url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "patch_url": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + } + } + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "draft": { + "type": ["null", "boolean"] + }, + "closed_by": { + "$ref": "shared/user.json#/" + }, + "body_html": { + "type": ["null", "string"] + }, + "body_text": { + "type": ["null", "string"] + }, + "timeline_url": { + "type": ["null", "string"] + }, + "repository": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "license": { + "type": ["null", "object"], + "properties": { + "key": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "spdx_id": { + "type": 
["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + } + } + }, + "organization": { + "$ref": "shared/user.json#/" + }, + "forks": { + "type": ["null", "integer"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + 
}, + "notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + "type": ["null", "string"] + }, + "forks_count": { + "type": ["null", "integer"] + }, + "stargazers_count": { + "type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], 
+ "format": "date-time" + }, + "allow_rebase_merge": { + "type": ["null", "boolean"] + }, + "template_repository": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "owner": { + "type": ["null", "object"], + "properties": { + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + } + } + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + 
"compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + "type": ["null", "string"] + }, + "forks_count": { + "type": ["null", "integer"] + }, + "stargazers_count": { + 
"type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "allow_rebase_merge": { + "type": ["null", "boolean"] + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "allow_squash_merge": { + "type": ["null", "boolean"] + }, + "allow_auto_merge": { + "type": ["null", "boolean"] + }, + "delete_branch_on_merge": { + "type": ["null", "boolean"] + }, + "allow_update_branch": { + "type": ["null", "boolean"] + }, + "use_squash_pr_title_as_default": { + "type": ["null", "boolean"] + }, + "allow_merge_commit": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + } + } + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "allow_squash_merge": { + "type": ["null", "boolean"] + }, + "allow_auto_merge": { + "type": ["null", "boolean"] + }, + "delete_branch_on_merge": { + "type": ["null", "boolean"] + }, + "allow_update_branch": { + "type": ["null", "boolean"] + }, + 
"use_squash_pr_title_as_default": { + "type": ["null", "boolean"] + }, + "allow_merge_commit": { + "type": ["null", "boolean"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + }, + "open_issues": { + "type": ["null", "integer"] + }, + "watchers": { + "type": ["null", "integer"] + }, + "master_branch": { + "type": ["null", "string"] + }, + "starred_at": { + "type": ["null", "string"] + } + } + }, + "performed_via_github_app": { + "$ref": "shared/performed_via_github_app.json#/" + }, + "author_association": { + "type": ["null", "string"] + }, + "reactions": { + "$ref": "shared/reactions.json#/" + } + } + } }, "description": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "master_branch": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "pusher_type": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "ref": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "ref_type": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "commits": { - "type": [ - "null", - "array" - ], + "type": ["null", "array"], "items": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "author": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "email": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "name": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } }, "distinct": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] }, "message": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "sha": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + } + } + } + }, + "pages": { + "type": ["null", "array"], + "items": { + "type": 
["null", "object"], + "properties": { + "page_name": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "summary": { + "type": ["null", "string"] + }, + "action": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] } } } @@ -225,72 +1017,39 @@ } }, "distinct_size": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "head": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "push_id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "ref": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "size": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "public": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] }, "repo": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "name": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } }, "type": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } } \ No newline at end of file diff --git a/tap_github/schemas/issue_events.json b/tap_github/schemas/issue_events.json index ddd494ee..711b7f71 100644 --- a/tap_github/schemas/issue_events.json +++ b/tap_github/schemas/issue_events.json @@ -1,1473 +1,963 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "commit_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "created_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "commit_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "event": { - "type": [ - "null", - 
"string" - ] + "type": ["null", "string"] }, "issue": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "user": { - "type": [ - "null", - "object" - ], - "properties": { - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "avatar_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "type": { - "type": [ - "null", - "string" - ] - }, - "site_admin": { - "type": [ - "null", - "boolean" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "organizations_url": { - "type": [ - "null", - "string" - ] - }, - "following_url": { - "type": [ - "null", - "string" - ] - } - } + "$ref": "shared/user.json#/" }, "comments": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "author_association": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "milestone": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "closed_at": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"], + "format": "date-time" }, "closed_issues": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "created_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "creator": { - "type": [ - "null", - "object" - ], - "properties": { - "avatar_url": { - 
"type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "following_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - }, - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "organizations_url": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "site_admin": { - "type": [ - "null", - "boolean" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "type": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - } - } + "$ref": "shared/user.json#/" }, "description": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "due_on": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "labels_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "number": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "open_issues": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "state": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "title": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "updated_at": { - "type": [ - "null", - "string" - 
], + "type": ["null", "string"], "format": "date-time" }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } }, + "closed_by": { + "$ref": "shared/user.json#/" + }, "closed_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "body": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "active_lock_reason": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "performed_via_github_app": { - "type": [ - "null", - "string" - ] + "$ref": "shared/performed_via_github_app.json#/" }, "assignee": { - "type": [ - "null", - "object" - ], - "properties": { - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "avatar_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "type": { - "type": [ - "null", - "string" - ] - }, - "site_admin": { - "type": [ - "null", - "boolean" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "organizations_url": { - "type": [ - "null", - "string" - ] - }, - "following_url": { - "type": [ - "null", - "string" - ] - } - } + "$ref": "shared/user.json#/" }, "id": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "pull_request": { - "type": [ - "null", - "object" - ], 
+ "type": ["null", "object"], "properties": { "diff_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "patch_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + }, + "merged_at": { + "type": ["null", "string"], + "format": "date-time" } } }, "comments_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "locked": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] }, "labels_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "events_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "updated_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "state": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "repository_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "number": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "created_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "title": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "labels": { - "type": [ - "null", - "array" - ], + "type": ["null", "array"], "items": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "name": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "description": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, 
"node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "color": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "default": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] } } } }, - "assignees": { - "type": [ - "null", - "array" - ], - "items": { - "type": [ - "null", - "object" - ], - "properties": { - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "avatar_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "type": { - "type": [ - "null", - "string" - ] - }, - "site_admin": { - "type": [ - "null", - "boolean" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "organizations_url": { - "type": [ - "null", - "string" - ] - }, - "following_url": { - "type": [ - "null", - "string" - ] + "reactions": { + "$ref": "shared/reactions.json#/" + }, + "repository": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "license": { + "type": ["null", "object"], + "properties": { + "key": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "spdx_id": { + "type": ["null", 
"string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + } + } + }, + "organization": { + "$ref": "shared/user.json#/" + }, + "forks": { + "type": ["null", "integer"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + 
"notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + "type": ["null", "string"] + }, + "forks_count": { + "type": ["null", "integer"] + }, + "stargazers_count": { + "type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + 
"format": "date-time" + }, + "allow_rebase_merge": { + "type": ["null", "boolean"] + }, + "template_repository": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, 
+ "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + "type": ["null", "string"] + }, + "forks_count": { + "type": ["null", "integer"] + }, + "stargazers_count": { + "type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", 
"string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "allow_rebase_merge": { + "type": ["null", "boolean"] + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "allow_squash_merge": { + "type": ["null", "boolean"] + }, + "allow_auto_merge": { + "type": ["null", "boolean"] + }, + "delete_branch_on_merge": { + "type": ["null", "boolean"] + }, + "allow_update_branch": { + "type": ["null", "boolean"] + }, + "use_squash_pr_title_as_default": { + "type": ["null", "boolean"] + }, + "allow_merge_commit": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + } } + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "allow_squash_merge": { + "type": ["null", "boolean"] + }, + "allow_auto_merge": { + "type": ["null", "boolean"] + }, + "delete_branch_on_merge": { + "type": ["null", "boolean"] + }, + "allow_update_branch": { + "type": ["null", "boolean"] + }, + "use_squash_pr_title_as_default": { + "type": ["null", "boolean"] + }, + "allow_merge_commit": { + "type": ["null", "boolean"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + }, + "open_issues": { + "type": ["null", "integer"] + }, + "watchers": { + "type": ["null", "integer"] + }, + "master_branch": { + "type": ["null", "string"] + }, + "starred_at": { + "type": ["null", "string"] } } + }, + "assignees": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } } } }, "id": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "actor": { - "type": [ - "null", - "object" - ], - "properties": { - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - 
"null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "avatar_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "type": { - "type": [ - "null", - "string" - ] - }, - "site_admin": { - "type": [ - "null", - "boolean" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "organizations_url": { - "type": [ - "null", - "string" - ] - }, - "following_url": { - "type": [ - "null", - "string" - ] - } - } + "$ref": "shared/user.json#/" }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "performed_via_github_app": { - "type": [ - "null", - "string" - ] + "$ref": "shared/performed_via_github_app.json#/" }, "_sdc_repository": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "rename": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "to": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "from": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } }, "label": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "name": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "color": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } }, "requested_reviewer": { - "type": [ - "null", - "object" - ], + "$ref": "shared/user.json#/" + }, + "review_requester": { + "$ref": "shared/user.json#/" + }, + "requested_team": { + "type": ["null", "object"], "properties": { - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - 
"received_events_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] + "id": { + "type": ["null", "integer"] }, - "repos_url": { - "type": [ - "null", - "string" - ] + "node_id": { + "type": ["null", "string"] }, - "events_url": { - "type": [ - "null", - "string" - ] + "name": { + "type": ["null", "string"] }, - "login": { - "type": [ - "null", - "string" - ] + "slug": { + "type": ["null", "string"] }, - "followers_url": { - "type": [ - "null", - "string" - ] + "description": { + "type": ["null", "string"] }, - "starred_url": { - "type": [ - "null", - "string" - ] + "privacy": { + "type": ["null", "string"] }, - "avatar_url": { - "type": [ - "null", - "string" - ] + "permission": { + "type": ["null", "string"] }, - "id": { - "type": [ - "null", - "integer" - ] + "permissions": { + "$ref": "shared/pull_permissions.json#/" }, - "type": { - "type": [ - "null", - "string" - ] + "url": { + "type": ["null", "string"] }, - "site_admin": { - "type": [ - "null", - "boolean" - ] + "html_url": { + "type": ["null", "string"] }, - "node_id": { - "type": [ - "null", - "string" - ] + "members_url": { + "type": ["null", "string"] }, - "organizations_url": { - "type": [ - "null", - "string" - ] + "repositories_url": { + "type": ["null", "string"] }, - "following_url": { - "type": [ - "null", - "string" - ] + "parent": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "members_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "permission": { + "type": ["null", "string"] + }, + "privacy": { + "type": ["null", "string"] + }, + 
"html_url": { + "type": ["null", "string"] + }, + "repositories_url": { + "type": ["null", "string"] + }, + "slug": { + "type": ["null", "string"] + }, + "ldap_dn": { + "type": ["null", "string"] + } + } } } }, - "review_requester": { - "type": [ - "null", - "object" - ], + "dismissed_review": { + "type": ["null", "object"], "properties": { - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "avatar_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "type": { - "type": [ - "null", - "string" - ] - }, - "site_admin": { - "type": [ - "null", - "boolean" - ] + "state": { + "type": ["null", "string"] }, - "node_id": { - "type": [ - "null", - "string" - ] + "review_id": { + "type": ["null", "integer"] }, - "organizations_url": { - "type": [ - "null", - "string" - ] + "dismissal_message": { + "type": ["null", "string"] }, - "following_url": { - "type": [ - "null", - "string" - ] + "dismissal_commit_id": { + "type": ["null", "string"] } } }, - "assignee": { - "type": [ - "null", - "object" - ], + "milestone": { + "type": ["null", "object"], "properties": { - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": 
{ - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "avatar_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "type": { - "type": [ - "null", - "string" - ] - }, - "site_admin": { - "type": [ - "null", - "boolean" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "organizations_url": { - "type": [ - "null", - "string" - ] - }, - "following_url": { - "type": [ - "null", - "string" - ] + "title": { + "type":["null", "string"] } } }, - "assigner": { - "type": [ - "null", - "object" - ], + "project_card": { + "type": ["null", "object"], "properties": { - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, "url": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "avatar_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, - "type": { - "type": [ - "null", - "string" - ] + "project_url": { + "type": ["null", "string"] }, - "site_admin": { - "type": [ - "null", - "boolean" - ] + "project_id": { + "type": ["null", 
"integer"] }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "organizations_url": { - "type": [ - "null", - "string" - ] + "column_name": { + "type": ["null", "string"] }, - "following_url": { - "type": [ - "null", - "string" - ] + "previous_column_name": { + "type": ["null", "string"] } } + }, + "draft": { + "type": ["null", "boolean"] + }, + "author_association": { + "type": ["null", "string"] + }, + "lock_reason": { + "type": ["null", "string"] + }, + "assignee": { + "$ref": "shared/user.json#/" + }, + "assigner": { + "$ref": "shared/user.json#/" } } } \ No newline at end of file diff --git a/tap_github/schemas/issue_labels.json b/tap_github/schemas/issue_labels.json index d1962337..32a097df 100644 --- a/tap_github/schemas/issue_labels.json +++ b/tap_github/schemas/issue_labels.json @@ -1,56 +1,29 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "_sdc_repository": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "name": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "description": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "color": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "default": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] } } - } \ No newline at end of file +} diff --git a/tap_github/schemas/issue_milestones.json b/tap_github/schemas/issue_milestones.json index d2c2f372..eb14f446 100644 --- a/tap_github/schemas/issue_milestones.json +++ b/tap_github/schemas/issue_milestones.json @@ -1,224 +1,125 @@ { - "type": [ - "null", - "object" - ], - "properties": { - "_sdc_repository": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - 
"null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "labels_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "number" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "number": { - "type": [ - "null", - "number" - ] - }, - "state": { - "type": [ - "null", - "string" - ] - }, - "title": { - "type": [ - "null", - "string" - ] - }, - "description": { - "type": [ - "null", - "string" - ] - }, - "creator": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "number"] + }, + "state": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "creator": { + "type": ["null", "object"], "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, "login": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "avatar_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "gravatar_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "followers_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "following_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "gists_url": { - 
"type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "starred_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "subscriptions_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "organizations_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "repos_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "events_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "received_events_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "type": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "site_admin": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] } } - }, - "open_issues": { - "type": [ - "null", - "number" - ] - }, - "closed_issues": { - "type": [ - "null", - "number" - ] - }, - "created_at": { - "type": [ - "null", - "string" - ], - "format": "date-time" - }, - "updated_at": { - "type": [ - "null", - "string" - ], - "format": "date-time" - }, - "closed_at": { - "type": [ - "null", - "string" - ], - "format": "date-time" - }, - "due_on": { - "type": [ - "null", - "string" - ], - "format": "date-time" - } + }, + "open_issues": { + "type": ["null", "number"] + }, + "closed_issues": { + "type": ["null", "number"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "due_on": { + "type": ["null", "string"], + "format": "date-time" } - } \ No newline at end of file + } +} diff --git a/tap_github/schemas/issues.json b/tap_github/schemas/issues.json index 81737ce1..93365708 100644 --- a/tap_github/schemas/issues.json +++ b/tap_github/schemas/issues.json @@ -1,21 +1,18 @@ { "properties": { "state": { - "type": [ - "null", - "string" - 
] + "type": ["null", "string"] + }, + "state_reason": { + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "labels": { "type": ["null", "array"], "items": { - "type": "object", + "type": ["null", "object"], "properties": { "id": { "type": ["null", "integer"] @@ -42,265 +39,238 @@ } }, "repository_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "number": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "closed_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "labels_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "title": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "assignee": { - "type": [ - "null", - "object" - ], - "properties": {} + "$ref": "shared/user.json#/" + }, + "assignees": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } + }, + "milestone": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "integer"] + }, + "state": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "creator": { + "$ref": "shared/user.json#/" + }, + "open_issues": { + "type": ["null", "integer"] + }, + "closed_issues": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "due_on": { + "type": ["null", "string"], + "format": "date-time" + } + } + }, + "reactions": { + "$ref": 
"shared/reactions.json#/" + }, + "active_lock_reason": { + "type": ["null", "string"] + }, + "body_html": { + "type": ["null", "string"] + }, + "performed_via_github_app": { + "$ref": "shared/performed_via_github_app.json#/" + }, + "timeline_url": { + "type": ["null", "string"] + }, + "closed_by": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + } + } }, "updated_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "author_association": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "locked": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] }, "events_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "pull_request": { "properties": { "diff_url": { - "type": [ - "null", - "string" - ] + "type": ["null", 
"string"] }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "patch_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + }, + "merged_at": { + "type": ["null", "string"], + "format": "date-time" } }, - "type": [ - "null", - "object" - ] + "type": ["null", "object"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "body": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "comments": { - "type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "created_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "_sdc_repository": { "type": ["string"] }, "user": { - "properties": { - "repos_url": { - "type": [ - "null", - "string" - ] - }, - "starred_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "received_events_url": { - "type": [ - "null", - "string" - ] - }, - "site_admin": { - "type": [ - "null", - "boolean" - ] - }, - "gravatar_id": { - "type": [ - "null", - "string" - ] - }, - "following_url": { - "type": [ - "null", - "string" - ] - }, - "avatar_url": { - "type": [ - "null", - "string" - ] - }, - "events_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "login": { - "type": [ - "null", - "string" - ] - }, - "organizations_url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "type": { - "type": [ - "null", - "string" - ] - }, - "subscriptions_url": { - "type": [ - "null", - "string" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "followers_url": { - "type": [ - "null", - "string" - ] - }, - "gists_url": { - "type": [ - "null", - "string" - ] - } - }, - "type": [ - "null", - "object" - ] + "$ref": "shared/user.json#/" }, "id": { - 
"type": [ - "null", - "integer" - ] + "type": ["null", "integer"] }, "comments_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + }, + "body_text": { + "type": ["null", "string"] + }, + "draft": { + "type": ["null", "boolean"] } }, - "type": [ - "null", - "object" - ] + "type": ["null", "object"] } diff --git a/tap_github/schemas/pr_commits.json b/tap_github/schemas/pr_commits.json new file mode 100644 index 00000000..f4fa2f82 --- /dev/null +++ b/tap_github/schemas/pr_commits.json @@ -0,0 +1,323 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "sha": { + "type": ["null", "string"], + "description": "The git commit hash" + }, + "node_id": { + "type": ["null","string"] + }, + "url": { + "type": ["null", "string"] + }, + "parents": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "additionalProperties": false, + "properties": { + "sha": { + "type": ["null", "string"], + "description": "The git hash of the parent commit" + }, + "url": { + "type": ["null", "string"], + "description": "The URL to the parent commit" + }, + "html_url": { + "type": ["null", "string"], + "description": "The HTML URL to the parent commit" + } + } + } + }, + "files": { + "type": ["null","array"], + "items": { + "type": ["null","object"], + "properties": { + "filename": { + "type": ["null","string"] + }, + "additions": { + "type": ["null","number"] + }, + "deletions": { + "type": ["null","number"] + }, + "changes": { + "type": ["null","number"] + }, + "status": { + "type": ["null","string"] + }, + "raw_url": { + "type": ["null","string"] + }, + "blob_url": { + "type": ["null","string"] + }, + "contents_url": { + "type": ["null","string"] + }, + "sha": { + "type": ["null","string"] + }, + "patch": { + "type": ["null","string"] + } + } + } + }, + "html_url": { + "type": ["null", "string"], + "description": "The HTML URL to the commit" + }, + "comments_url": { + "type": ["null", "string"], + 
"description": "The URL to the commit's comments page" + }, + "commit": { + "type": ["null", "object"], + "additionalProperties": false, + "properties": { + "url": { + "type": ["null", "string"], + "description": "The URL to the commit" + }, + "tree": { + "type": ["null", "object"], + "additionalProperties": false, + "properties": { + "sha": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + } + } + }, + "author": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + }, + "date": { + "type": ["null", "string"], + "format": "date-time" + } + } + }, + "verification": { + "type": ["null", "object"], + "properties": { + "verified": { + "type": ["null", "boolean"] + }, + "reason": { + "type": ["null", "string"] + }, + "payload": { + "type": ["null", "string"] + }, + "signature": { + "type": ["null", "string"] + } + } + }, + "message": { + "type": ["null", "string"], + "description": "The 
commit message" + }, + "comment_count": { + "type": ["null", "integer"] + }, + "committer": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "date": { + "type": ["null", "string"], + "format": "date-time" + } + } + } + } + }, + "committer": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + }, + "date": { + "type": ["null", "string"], + "format": "date-time" + } + } + }, + "pr_number": { + "type": ["null", "integer"] + }, + "pr_id": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "string"] + }, + "author": { + "$ref": "shared/user.json#/" + }, + "stats": { + "type": ["null", "object"], + "properties": { + "additions": { + "type": ["null", "integer"] + }, + "deletions": { + "type": ["null", "integer"] + }, + "total": { + "type": ["null", "integer"] + } 
+ } + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + } + } +} \ No newline at end of file diff --git a/tap_github/schemas/project_cards.json b/tap_github/schemas/project_cards.json index f0bc15ab..bb62bdf2 100644 --- a/tap_github/schemas/project_cards.json +++ b/tap_github/schemas/project_cards.json @@ -1,108 +1,120 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "_sdc_repository": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + }, + "project_id": { + "type": ["null", "string"] + }, + "column_name": { + "type": ["null", "string"] }, "cards_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "name": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "note": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "creator": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, "login": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + 
"organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] } } }, "created_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "updated_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" }, "archived": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] }, "column_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "content_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "project_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } - } \ No newline at end of file +} \ No newline at end of file diff --git a/tap_github/schemas/project_columns.json b/tap_github/schemas/project_columns.json index 1ebe6782..87e72543 100644 --- a/tap_github/schemas/project_columns.json +++ b/tap_github/schemas/project_columns.json @@ -1,64 +1,34 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "_sdc_repository": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "project_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "cards_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "name": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "created_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": 
"date-time" }, "updated_at": { - "type": [ - "null", - "string" - ], + "type": ["null", "string"], "format": "date-time" } } - } \ No newline at end of file +} diff --git a/tap_github/schemas/projects.json b/tap_github/schemas/projects.json index 3b4f5d89..e8d659ad 100644 --- a/tap_github/schemas/projects.json +++ b/tap_github/schemas/projects.json @@ -1,102 +1,117 @@ { - "type": [ - "null", - "object" - ], - "properties": { - "owner_url": { - "type": [ - "null", - "string" - ] - }, - "url": { - "type": [ - "null", - "string" - ] - }, - "html_url": { - "type": [ - "null", - "string" - ] - }, - "columns_url": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "number" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - }, - "name": { - "type": [ - "null", - "string" - ] - }, - "body": { - "type": [ - "null", - "string" - ] - }, - "number": { - "type": [ - "null", - "number" - ] - }, - "state": { - "type": [ - "null", - "string" - ] - }, - "creator": { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], + "properties": { + "owner_url": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "columns_url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "body": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "number"] + }, + "state": { + "type": ["null", "string"] + }, + "creator": { + "type": ["null", "object"], "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, "login": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": 
{ + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] } } - }, - "created_at": { - "type": [ - "null", - "string" - ], - "format": "date-time" - }, - "updated_at": { - "type": [ - "null", - "string" - ], - "format": "date-time" - } + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "organization_permission": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] } - } \ No newline at end of file + } +} \ No newline at end of file diff --git a/tap_github/schemas/pull_requests.json b/tap_github/schemas/pull_requests.json index 91c0c4bb..cbb0128d 100644 --- a/tap_github/schemas/pull_requests.json +++ b/tap_github/schemas/pull_requests.json @@ -8,6 +8,999 @@ "id": { "type": ["null", "string"] }, + "node_id": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "draft": { + "type": ["null", "boolean"] + }, + "requested_reviewers": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", 
"integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + } + } + } + }, + "merge_commit_sha": { + "type": ["null", "string"] + }, + "review_comments_url": { + "type": ["null", "string"] + }, + "active_lock_reason": { + "type": ["null", "string"] + }, + "author_association": { + "type": ["null", "string"] + }, + "diff_url": { + "type": ["null", "string"] + }, + "assignee": { + "$ref": "shared/user.json#/" + }, + "comments_url": { + "type": ["null", "string"] + }, + "head": { + "type": ["null", "object"], + "properties": { + "label": { + "type": ["null", "string"] + }, + "ref": { + "type": ["null", "string"] + }, + "repo": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "license": { + "type": ["null", "object"], + "properties": { + "key": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "spdx_id": { + "type": ["null", "string"] + }, + "node_id": { + "type": 
["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + } + } + }, + "organization": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + } + } + }, + "forks": { + "type": ["null", "integer"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + 
"type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + "type": ["null", "string"] + }, + "forks_count": { + "type": ["null", 
"integer"] + }, + "stargazers_count": { + "type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "allow_rebase_merge": { + "type": ["null", "boolean"] + }, + "template_repository": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + 
"comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + 
"type": ["null", "string"] + }, + "forks_count": { + "type": ["null", "integer"] + }, + "stargazers_count": { + "type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "allow_rebase_merge": { + "type": ["null", "boolean"] + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "allow_squash_merge": { + "type": ["null", "boolean"] + }, + "allow_auto_merge": { + "type": ["null", "boolean"] + }, + "delete_branch_on_merge": { + "type": ["null", "boolean"] + }, + "allow_update_branch": { + "type": ["null", "boolean"] + }, + "use_squash_pr_title_as_default": { + "type": ["null", "boolean"] + }, + "allow_merge_commit": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + } + } + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "allow_squash_merge": { + "type": ["null", "boolean"] + }, + "allow_auto_merge": { + "type": ["null", "boolean"] + }, + "delete_branch_on_merge": { + "type": 
["null", "boolean"] + }, + "allow_update_branch": { + "type": ["null", "boolean"] + }, + "use_squash_pr_title_as_default": { + "type": ["null", "boolean"] + }, + "allow_merge_commit": { + "type": ["null", "boolean"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + }, + "open_issues": { + "type": ["null", "integer"] + }, + "watchers": { + "type": ["null", "integer"] + }, + "master_branch": { + "type": ["null", "string"] + }, + "starred_at": { + "type": ["null", "string"] + } + } + }, + "sha": { + "type": ["null", "string"] + }, + "user": { + "$ref": "shared/user.json#/" + } + } + }, + "commits_url": { + "type": ["null", "string"] + }, + "auto_merge": { + "type": ["null", "object"], + "properties": { + "enabled_by": { + "$ref": "shared/user.json#/" + }, + "merge_method": { + "type": ["null", "string"] + }, + "commit_title": { + "type": ["null", "string"] + }, + "commit_message": { + "type": ["null", "string"] + } + } + }, + "locked": { + "type": ["null", "boolean"] + }, + "assignees": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } + }, + "issues_url": { + "type": ["null", "string"] + }, + "milestone": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "integer"] + }, + "state": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "creator": { + "$ref": "shared/user.json#/" + }, + "open_issues": { + "type": ["null", "integer"] + }, + "closed_issues": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, 
+ "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "due_on": { + "type": ["null", "string"], + "format": "date-time" + } + } + }, + "_links": { + "type": ["null", "object"], + "properties": { + "comments": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + }, + "commits": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + }, + "statuses": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + }, + "html": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + }, + "issue": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + }, + "review_comments": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + }, + "review_comment": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + }, + "self": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + } + } + }, + "html_url": { + "type": ["null", "string"] + }, + "requested_teams": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "slug": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "privacy": { + "type": ["null", "string"] + }, + "permission": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "members_url": { + "type": ["null", "string"] + }, + "repositories_url": { + 
"type": ["null", "string"] + }, + "parent": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "members_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "permission": { + "type": ["null", "string"] + }, + "privacy": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "repositories_url": { + "type": ["null", "string"] + }, + "slug": { + "type": ["null", "string"] + }, + "ldap_dn": { + "type": ["null", "string"] + } + } + } + } + } + }, + "patch_url": { + "type": ["null", "string"] + }, "url": { "type": ["null", "string"] }, @@ -53,42 +1046,583 @@ } }, "user": { - "type": ["null", "object"], - "additionalProperties": false, - "properties": { - "login": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "integer"] - } - } + "$ref": "shared/user.json#/" }, "base": { "type": ["null", "object"], "properties": { - "ref": { + "label": { "type": ["null", "string"] }, - "label": { + "ref": { "type": ["null", "string"] }, "repo": { "type": ["null", "object"], "properties": { "id": { - "type": [ "null", "integer" ] + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] }, "name": { - "type": [ "null", "string" ] + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "license": { + "type": ["null", "object"], + "properties": { + "key": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "spdx_id": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + } + } + }, + "organization": { + "$ref": "shared/user.json#/" + }, + "forks": { + "type": ["null", "integer"] + }, + "permissions": { + "$ref": 
"shared/pull_permissions.json#/" + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] }, "url": { - "type": [ "null", "string" ] + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "stargazers_url": 
{ + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + "type": ["null", "string"] + }, + "forks_count": { + "type": ["null", "integer"] + }, + "stargazers_count": { + "type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "allow_rebase_merge": { + "type": ["null", "boolean"] + }, + "template_repository": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": 
["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": 
{ + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + "type": ["null", "string"] + }, + "forks_count": { + "type": ["null", "integer"] + }, + "stargazers_count": { + "type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "allow_rebase_merge": { + "type": ["null", "boolean"] + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "allow_squash_merge": { + "type": 
["null", "boolean"] + }, + "allow_auto_merge": { + "type": ["null", "boolean"] + }, + "delete_branch_on_merge": { + "type": ["null", "boolean"] + }, + "allow_update_branch": { + "type": ["null", "boolean"] + }, + "use_squash_pr_title_as_default": { + "type": ["null", "boolean"] + }, + "allow_merge_commit": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + } + } + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "allow_squash_merge": { + "type": ["null", "boolean"] + }, + "allow_auto_merge": { + "type": ["null", "boolean"] + }, + "delete_branch_on_merge": { + "type": ["null", "boolean"] + }, + "allow_update_branch": { + "type": ["null", "boolean"] + }, + "use_squash_pr_title_as_default": { + "type": ["null", "boolean"] + }, + "allow_merge_commit": { + "type": ["null", "boolean"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + }, + "open_issues": { + "type": ["null", "integer"] + }, + "watchers": { + "type": ["null", "integer"] + }, + "master_branch": { + "type": ["null", "string"] + }, + "starred_at": { + "type": ["null", "string"] } } }, "sha": { "type": ["null", "string"] + }, + "user": { + "$ref": "shared/user.json#/" } } }, diff --git a/tap_github/schemas/releases.json b/tap_github/schemas/releases.json index 3b040003..b903a026 100644 --- a/tap_github/schemas/releases.json +++ b/tap_github/schemas/releases.json @@ -8,9 +8,151 @@ "id": { "type": ["null", "string"] }, + "node_id": { + "type": ["null", "string"] + }, "url": { "type": ["null", "string"] }, + "zipball_url": { + "type": ["null", "string"] + }, + "body_text": { + "type": ["null", "string"] + }, + "upload_url": { + "type": ["null", "string"] + }, + "assets_url": { + "type": ["null", "string"] + }, + "tarball_url": { + "type": ["null", "string"] + }, + "body_html": { + 
"type": ["null", "string"] + }, + "reactions": { + "$ref": "shared/reactions.json#/" + }, + "assets": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "browser_download_url": { + "type": ["null", "string"], + "format": "uri" + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "label": { + "type": ["null", "string"] + }, + "state": { + "type": ["null", "string"] + }, + "content_type": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "integer"] + }, + "download_count": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "uploader": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", 
"string"] + } + } + } + } + } + }, + "mentions_count": { + "type": ["null", "integer"] + }, "html_url": { "type": ["null", "string"] }, @@ -33,16 +175,7 @@ "type": ["null", "boolean"] }, "author": { - "type": ["null", "object"], - "additionalProperties": false, - "properties": { - "login": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "integer"] - } - } + "$ref": "shared/user.json#/" }, "created_at": { "type": ["null", "string"], @@ -51,6 +184,10 @@ "published_at": { "type": ["null", "string"], "format": "date-time" + }, + "discussion_url": { + "type": ["null", "string"], + "format": "uri" } } } \ No newline at end of file diff --git a/tap_github/schemas/review_comments.json b/tap_github/schemas/review_comments.json index 71452419..8eae9585 100644 --- a/tap_github/schemas/review_comments.json +++ b/tap_github/schemas/review_comments.json @@ -9,21 +9,71 @@ "type": ["null", "integer"] }, "user": { + "$ref": "shared/user.json#/" + }, + "body": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "_links": { "type": ["null", "object"], - "additionalProperties": false, "properties": { - "login": { - "type": ["null", "string"] + "self": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } + }, + "html": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } }, - "id": { - "type": ["null", "integer"] + "pull_request": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } } } }, - "body": { + "url": { "type": ["null", "string"] }, - "node_id": { + "original_start_line": { + "type": ["null", "integer"] + }, + "start_side": { + "type": ["null", "string"] + }, + "body_text": { + "type": ["null", "string"] + }, + "original_line": { + "type": ["null", "integer"] + }, + "reactions": { + "$ref": "shared/reactions.json#/" + }, + "start_line": { + "type": ["null", "integer"] + }, +
"body_html": { + "type": ["null", "string"] + }, + "line": { + "type": ["null", "integer"] + }, + "side": { "type": ["null", "string"] }, "pull_request_review_id": { @@ -88,9 +138,6 @@ "head": { "type": ["null", "string"] }, - "html_url": { - "type": ["null", "string"] - }, "issue_url": { "type": ["null", "string"] }, @@ -123,6 +170,9 @@ }, "statuses_url": { "type": ["null", "string"] + }, + "pr_id": { + "type": ["null", "string"] } } } diff --git a/tap_github/schemas/reviews.json b/tap_github/schemas/reviews.json index b7ad05f9..e065a74e 100644 --- a/tap_github/schemas/reviews.json +++ b/tap_github/schemas/reviews.json @@ -8,18 +8,42 @@ "id": { "type": ["null", "integer"] }, - "user": { + "_links": { "type": ["null", "object"], - "additionalProperties": false, "properties": { - "login": { - "type": ["null", "string"] + "html": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } }, - "id": { - "type": ["null", "integer"] + "pull_request": { + "type": ["null", "object"], + "properties": { + "href": { + "type": ["null", "string"] + } + } } } }, + "body_html": { + "type": ["null", "string"] + }, + "body_text": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "author_association": { + "type": ["null", "string"] + }, + "user": { + "$ref": "shared/user.json#/" + }, "body": { "type": ["null", "string"] }, @@ -38,6 +62,9 @@ "submitted_at": { "type": ["null", "string"], "format": "date-time" + }, + "pr_id": { + "type": ["null", "string"] } } } diff --git a/tap_github/schemas/shared/issue_permissions.json b/tap_github/schemas/shared/issue_permissions.json new file mode 100644 index 00000000..2ec35a46 --- /dev/null +++ b/tap_github/schemas/shared/issue_permissions.json @@ -0,0 +1,20 @@ +{ + "type": ["null", "object"], + "properties": { + "issues": { + "type": ["null", "string"] + }, + "checks": { + "type": ["null", "string"] + }, + "metadata": { + "type": ["null", "string"] + }, + "contents": 
{ + "type": ["null", "string"] + }, + "deployments": { + "type": ["null", "string"] + } + } + } \ No newline at end of file diff --git a/tap_github/schemas/shared/performed_via_github_app.json b/tap_github/schemas/shared/performed_via_github_app.json new file mode 100644 index 00000000..eabc7b70 --- /dev/null +++ b/tap_github/schemas/shared/performed_via_github_app.json @@ -0,0 +1,61 @@ +{ + "type": ["null", "object", "string"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "slug": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "external_url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "permissions": { + "$ref": "shared/issue_permissions.json#/" + }, + "events": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "installations_count": { + "type": ["null", "integer"] + }, + "client_id": { + "type": ["null", "string"] + }, + "client_secret": { + "type": ["null", "string"] + }, + "webhook_secret": { + "type": ["null", "string"] + }, + "pem": { + "type": ["null", "string"] + } + } +} \ No newline at end of file diff --git a/tap_github/schemas/shared/pull_permissions.json b/tap_github/schemas/shared/pull_permissions.json new file mode 100644 index 00000000..2eb4a910 --- /dev/null +++ b/tap_github/schemas/shared/pull_permissions.json @@ -0,0 +1,20 @@ +{ + "type": ["null", "object"], + "properties": { + "pull": { + "type": ["null", "boolean"] + }, + "triage": { + "type": ["null", "boolean"] + }, + "push": { + "type": ["null", "boolean"] + }, + "maintain": { + "type": ["null", "boolean"] + }, + "admin": { + "type": ["null", "boolean"] + } + } 
+} \ No newline at end of file diff --git a/tap_github/schemas/shared/reactions.json b/tap_github/schemas/shared/reactions.json new file mode 100644 index 00000000..543ae6ea --- /dev/null +++ b/tap_github/schemas/shared/reactions.json @@ -0,0 +1,35 @@ +{ + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "total_count": { + "type": ["null", "integer"] + }, + "+1": { + "type": ["null", "integer"] + }, + "-1": { + "type": ["null", "integer"] + }, + "laugh": { + "type": ["null", "integer"] + }, + "confused": { + "type": ["null", "integer"] + }, + "heart": { + "type": ["null", "integer"] + }, + "hooray": { + "type": ["null", "integer"] + }, + "eyes": { + "type": ["null", "integer"] + }, + "rocket": { + "type": ["null", "integer"] + } + } +} \ No newline at end of file diff --git a/tap_github/schemas/shared/user.json b/tap_github/schemas/shared/user.json new file mode 100644 index 00000000..45c45d0c --- /dev/null +++ b/tap_github/schemas/shared/user.json @@ -0,0 +1,68 @@ +{ + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] 
+ }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + } + } +} diff --git a/tap_github/schemas/stargazers.json b/tap_github/schemas/stargazers.json index d8d67b30..54e2d9e3 100644 --- a/tap_github/schemas/stargazers.json +++ b/tap_github/schemas/stargazers.json @@ -6,13 +6,7 @@ "type": ["string"] }, "user": { - "type": ["null", "object"], - "additionalProperties": false, - "properties": { - "id": { - "type": ["null", "integer"] - } - } + "$ref": "shared/user.json#/" }, "starred_at": { "type": ["null", "string"], diff --git a/tap_github/schemas/team_members.json b/tap_github/schemas/team_members.json index b707c5e3..d872bafa 100644 --- a/tap_github/schemas/team_members.json +++ b/tap_github/schemas/team_members.json @@ -1,128 +1,74 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "login": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", "number"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "starred_at": { + "type": ["null", "string"] }, "avatar_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "gravatar_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "followers_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "following_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "gists_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "starred_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, 
"subscriptions_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "organizations_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "repos_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "events_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "received_events_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "type": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "site_admin": { - "type": [ - "null", - "boolean" - ] + "type": ["null", "boolean"] }, "_sdc_repository": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "team_slug": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } - } \ No newline at end of file +} diff --git a/tap_github/schemas/team_memberships.json b/tap_github/schemas/team_memberships.json index 98f80e25..1c6d89a6 100644 --- a/tap_github/schemas/team_memberships.json +++ b/tap_github/schemas/team_memberships.json @@ -1,32 +1,20 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "role": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "state": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] }, "_sdc_repository": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] } } } \ No newline at end of file diff --git a/tap_github/schemas/teams.json b/tap_github/schemas/teams.json index 43fdee44..15a26205 100644 --- a/tap_github/schemas/teams.json +++ b/tap_github/schemas/teams.json @@ -1,87 +1,47 @@ { - "type": [ - "null", - "object" - ], + "type": ["null", "object"], "properties": { "_sdc_repository": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "id": { - "type": [ - "null", - "number" - ] + "type": ["null", 
"number"] }, "node_id": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "html_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" }, "name": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "slug": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "description": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "privacy": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "permission": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "members_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "repositories_url": { - "type": [ - "null", - "string" - ] + "type": ["null", "string"] }, "parent": { - "type": [ - "null", - "object", - "string" - ] + "type": ["null", "object", "string"] } } }
diff --git a/tap_github/streams.py b/tap_github/streams.py
new file mode 100644
index 00000000..278dd05a
--- /dev/null
+++ b/tap_github/streams.py
@@ -0,0 +1,768 @@
+from datetime import datetime
+import singer
+from singer import (metrics, bookmarks, metadata)
+
+LOGGER = singer.get_logger()
+DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
+
+def get_bookmark(state, repo, stream_name, bookmark_key, start_date):
+    """
+    Return the `bookmark_key` value saved in the state for this repo/stream; fall back to `start_date` when it is missing or null.
+    """
+    repo_stream_dict = bookmarks.get_bookmark(state, repo, stream_name)
+    if repo_stream_dict:
+        return repo_stream_dict.get(bookmark_key) or start_date
+
+    return start_date
+
+def get_schema(catalog, stream_id):
+    """
+    Return the catalog entry whose `tap_stream_id` equals `stream_id` (raises IndexError when there is none).
+    """
+    stream_catalog = [cat for cat in catalog if cat['tap_stream_id'] == stream_id][0]
+    return stream_catalog
+
+def get_child_full_url(domain, child_object, repo_path, parent_id, grand_parent_id):
+    """
+    Build the child stream's URL from `child_object.path` and the parent's and the grandparent's ids.
+    """
+
+    if child_object.use_repository:
+        # The `use_repository` represents that the url contains /repos and the repository name.
+        child_full_url = '{}/repos/{}/{}'.format(
+            domain,
+            repo_path,
+            child_object.path).format(*parent_id)
+
+    elif child_object.use_organization:
+        # The `use_organization` represents that the url contains the organization name.
+        child_full_url = '{}/{}'.format(
+            domain,
+            child_object.path).format(repo_path, *parent_id, *grand_parent_id)
+
+    else:
+        # Build and return url that does not contain the repos or the organization name.
+        # Example: https://base_url/projects/{project_id}/columns
+        child_full_url = '{}/{}'.format(
+            domain,
+            child_object.path).format(*grand_parent_id)
+    LOGGER.info("Final url is: %s", child_full_url)
+
+    return child_full_url
+
+
+class Stream:
+    """
+    A base class representing tap-github streams.
+    """
+    tap_stream_id = None
+    replication_method = None
+    replication_keys = None
+    key_properties = []
+    path = None
+    filter_param = False
+    id_keys = []
+    use_organization = False
+    children = []
+    pk_child_fields = []
+    use_repository = False
+    headers = {'Accept': '*/*'}
+    parent = None
+
+    def build_url(self, base_url, repo_path, bookmark):
+        """
+        Build the full url with parameters and attributes.
+        """
+        if self.filter_param:
+            # Add the since parameter for incremental streams
+            query_string = '?since={}'.format(bookmark)
+        else:
+            query_string = ''
+
+        if self.use_organization:
+            # The `use_organization` represents that the url contains the organization name.
+            full_url = '{}/{}'.format(
+                base_url,
+                self.path).format(repo_path)
+        else:
+            # The url that contains /repos and the repository name.
+            full_url = '{}/repos/{}/{}{}'.format(
+                base_url,
+                repo_path,
+                self.path,
+                query_string)
+
+        LOGGER.info("Final url is: %s", full_url)
+        return full_url
+
+    def get_min_bookmark(self, stream, selected_streams, bookmark, repo_path, start_date, state):
+        """
+        Get the minimum of `bookmark` and the bookmarks of the selected streams among `stream` and all of its nested children.
+        """
+
+        stream_obj = STREAMS[stream]()
+        min_bookmark = bookmark
+        if stream in selected_streams:
+            # Get minimum of stream's bookmark(start date in case of no bookmark) and min_bookmark
+            min_bookmark = min(min_bookmark, get_bookmark(state, repo_path, stream, "since", start_date))
+            LOGGER.debug("New minimum bookmark is %s", min_bookmark)
+
+        for child in stream_obj.children:
+            # Iterate through all children and return minimum bookmark among all.
+            min_bookmark = min(min_bookmark, self.get_min_bookmark(child, selected_streams, min_bookmark, repo_path, start_date, state))
+
+        return min_bookmark
+
+    def write_bookmarks(self, stream, selected_streams, bookmark_value, repo_path, state):
+        """Write `bookmark_value` under the "since" key in the state for the stream and, recursively, for its children."""
+        stream_obj = STREAMS[stream]()
+
+        # If the stream is selected, write the bookmark.
+        if stream in selected_streams:
+            singer.write_bookmark(state, repo_path, stream_obj.tap_stream_id, {"since": bookmark_value})
+
+        # For each child, write the bookmark if it is selected.
+        for child in stream_obj.children:
+            self.write_bookmarks(child, selected_streams, bookmark_value, repo_path, state)
+
+    # pylint: disable=no-self-use
+    def get_child_records(self,
+                          client,
+                          catalog,
+                          child_stream,
+                          grand_parent_id,
+                          repo_path,
+                          state,
+                          start_date,
+                          bookmark_dttm,
+                          stream_to_sync,
+                          selected_stream_ids,
+                          parent_id = None,
+                          parent_record = None):
+        """
+        Retrieve and write all the child records for each updated parent based on the parent record and its ids, then recurse into nested children.
+        """
+        child_object = STREAMS[child_stream]()
+
+        child_bookmark_value = get_bookmark(state, repo_path, child_object.tap_stream_id, "since", start_date)
+
+        if not parent_id:
+            parent_id = grand_parent_id
+
+        child_full_url = get_child_full_url(client.base_url, child_object, repo_path, parent_id, grand_parent_id)
+        stream_catalog = get_schema(catalog, child_object.tap_stream_id)
+
+        with metrics.record_counter(child_object.tap_stream_id) as counter:
+            for response in client.authed_get_all_pages(
+                    child_object.tap_stream_id,
+                    child_full_url,
+                    stream = child_object.tap_stream_id
+            ):
+                records = response.json()
+                extraction_time = singer.utils.now()
+
+                if isinstance(records, list):
+                    # Loop through all the records of response
+                    for record in records:
+                        record['_sdc_repository'] = repo_path
+                        child_object.add_fields_at_1st_level(record = record, parent_record = parent_record)
+
+                        with singer.Transformer() as transformer:
+
+                            rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata']))
+
+                            if child_object.tap_stream_id in selected_stream_ids and (record.get(child_object.replication_keys) or start_date) >= child_bookmark_value:
+                                singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time)
+                                counter.increment()
+
+                        # Loop through each child and nested child in the parent and fetch all the child records.
+                        for nested_child in child_object.children:
+                            if nested_child in stream_to_sync:
+                                # Collect id of child record to pass in the API of its sub-child.
+                                child_id = tuple(record.get(key) for key in STREAMS[nested_child]().id_keys)
+                                # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to
+                                # pass in the API of the current child's sub-child.
+                                child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, record)
+
+                else:
+                    # Write JSON response directly if it is a single record only.
+                    records['_sdc_repository'] = repo_path
+                    child_object.add_fields_at_1st_level(record = records, parent_record = parent_record)
+
+                    with singer.Transformer() as transformer:
+
+                        rec = transformer.transform(records, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata']))
+                        if child_object.tap_stream_id in selected_stream_ids and (records.get(child_object.replication_keys) or start_date) >= child_bookmark_value:
+                            singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time)
+                            counter.increment()
+
+    # pylint: disable=unnecessary-pass
+    def add_fields_at_1st_level(self, record, parent_record = None):
+        """
+        Add fields in the record explicitly at the 1st level of JSON. The base implementation adds nothing.
+        """
+        pass
+
+class FullTableStream(Stream):
+    def sync_endpoint(self,
+                      client,
+                      state,
+                      catalog,
+                      repo_path,
+                      start_date,
+                      selected_stream_ids,
+                      stream_to_sync
+                      ):
+        """
+        A common function to sync full table streams.
+        """
+
+        # build full url
+        full_url = self.build_url(client.base_url, repo_path, None)
+
+        stream_catalog = get_schema(catalog, self.tap_stream_id)
+
+        with metrics.record_counter(self.tap_stream_id) as counter:
+            for response in client.authed_get_all_pages(
+                    self.tap_stream_id,
+                    full_url,
+                    self.headers,
+                    stream = self.tap_stream_id
+            ):
+                records = response.json()
+                extraction_time = singer.utils.now()
+                # Loop through all records
+                for record in records:
+
+                    record['_sdc_repository'] = repo_path
+                    self.add_fields_at_1st_level(record = record, parent_record = None)
+
+                    with singer.Transformer() as transformer:
+                        rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata']))
+                        if self.tap_stream_id in selected_stream_ids:
+
+                            singer.write_record(self.tap_stream_id, rec, time_extracted=extraction_time)
+
+                            counter.increment()
+
+                    for child in self.children:
+                        if child in stream_to_sync:
+
+                            parent_id = tuple(record.get(key) for key in STREAMS[child]().id_keys)
+
+                            # Sync child stream, if it is selected or its nested child is selected.
+                            self.get_child_records(client,
+                                                   catalog,
+                                                   child,
+                                                   parent_id,
+                                                   repo_path,
+                                                   state,
+                                                   start_date,
+                                                   record.get(self.replication_keys),
+                                                   stream_to_sync,
+                                                   selected_stream_ids,
+                                                   parent_record = record)
+
+        return state
+
+class IncrementalStream(Stream):
+    def sync_endpoint(self,
+                      client,
+                      state,
+                      catalog,
+                      repo_path,
+                      start_date,
+                      selected_stream_ids,
+                      stream_to_sync
+                      ):
+
+        """
+        A common function to sync incremental streams whose records are not returned in descending
+        order of the replication key. Iterate over all records, write only the newly updated records and
+        save the latest replication key value seen as the bookmark.
+        """
+
+        parent_bookmark_value = get_bookmark(state, repo_path, self.tap_stream_id, "since", start_date)
+        current_time = datetime.utcnow().strftime(DATE_FORMAT)  # DATE_FORMAT ends in a literal 'Z', so the value must be UTC, not local time
+        min_bookmark_value = self.get_min_bookmark(self.tap_stream_id, selected_stream_ids, current_time, repo_path, start_date, state)
+
+        max_bookmark_value = min_bookmark_value
+
+        # build full url
+        full_url = self.build_url(client.base_url, repo_path, min_bookmark_value)
+
+        stream_catalog = get_schema(catalog, self.tap_stream_id)
+
+        with metrics.record_counter(self.tap_stream_id) as counter:
+            for response in client.authed_get_all_pages(
+                    self.tap_stream_id,
+                    full_url,
+                    self.headers,
+                    stream = self.tap_stream_id
+            ):
+                records = response.json()
+                extraction_time = singer.utils.now()
+                # Loop through all records
+                for record in records:
+
+                    record['_sdc_repository'] = repo_path
+                    self.add_fields_at_1st_level(record = record, parent_record = None)
+
+                    with singer.Transformer() as transformer:
+                        if record.get(self.replication_keys):
+                            if record[self.replication_keys] >= max_bookmark_value:
+                                # Update max_bookmark_value
+                                max_bookmark_value = record[self.replication_keys]
+
+                            bookmark_dttm = record[self.replication_keys]
+
+                            # Keep only records whose bookmark is after the last_datetime
+                            if bookmark_dttm >= min_bookmark_value:
+
+                                if self.tap_stream_id in selected_stream_ids and bookmark_dttm >= parent_bookmark_value:
+                                    rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata']))
+
+                                    singer.write_record(self.tap_stream_id, rec, time_extracted=extraction_time)
+                                    counter.increment()
+
+                                for child in self.children:
+                                    if child in stream_to_sync:
+
+                                        parent_id = tuple(record.get(key) for key in STREAMS[child]().id_keys)
+
+                                        # Sync child stream, if it is selected or its nested child is selected.
+                                        self.get_child_records(client,
+                                                               catalog,
+                                                               child,
+                                                               parent_id,
+                                                               repo_path,
+                                                               state,
+                                                               start_date,
+                                                               record.get(self.replication_keys),
+                                                               stream_to_sync,
+                                                               selected_stream_ids,
+                                                               parent_record = record)
+                        else:
+                            LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.",
+                                           self.tap_stream_id, self.key_properties, [record.get(key) for key in self.key_properties], self.replication_keys)
+
+
+        # Write bookmark for incremental stream.
+        self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state)
+
+        return state
+
+class IncrementalOrderedStream(Stream):
+
+    def sync_endpoint(self,
+                      client,
+                      state,
+                      catalog,
+                      repo_path,
+                      start_date,
+                      selected_stream_ids,
+                      stream_to_sync
+                      ):
+        """
+        A sync function for streams that have records in the descending order of replication key value. For such streams,
+        iterate only the latest records and stop at the first record older than the minimum saved bookmark.
+        """
+        bookmark_value = get_bookmark(state, repo_path, self.tap_stream_id, "since", start_date)
+        current_time = datetime.utcnow().strftime(DATE_FORMAT)  # DATE_FORMAT ends in a literal 'Z', so the value must be UTC, not local time
+
+        min_bookmark_value = self.get_min_bookmark(self.tap_stream_id, selected_stream_ids, current_time, repo_path, start_date, state)
+        bookmark_time = singer.utils.strptime_to_utc(min_bookmark_value)
+
+        # Build full url
+        full_url = self.build_url(client.base_url, repo_path, bookmark_value)
+        synced_all_records = False
+        stream_catalog = get_schema(catalog, self.tap_stream_id)
+
+        parent_bookmark_value = bookmark_value
+        record_counter = 0
+        with metrics.record_counter(self.tap_stream_id) as counter:
+            for response in client.authed_get_all_pages(
+                    self.tap_stream_id,
+                    full_url,
+                    stream = self.tap_stream_id
+            ):
+                records = response.json()
+                extraction_time = singer.utils.now()
+                for record in records:
+                    record['_sdc_repository'] = repo_path
+                    self.add_fields_at_1st_level(record = record, parent_record = None)
+
+                    updated_at = record.get(self.replication_keys)
+
+                    if record_counter == 0 and updated_at and updated_at > bookmark_value:
+                        # Consider replication key value of 1st record as bookmark value.
+                        # Because all records are in descending order of replication key value
+                        bookmark_value = updated_at
+                    record_counter = record_counter + 1
+
+                    if updated_at:
+                        if bookmark_time and singer.utils.strptime_to_utc(updated_at) < bookmark_time:
+                            # Skip all records from now onwards because the bookmark value of the current record is less than
+                            # last saved bookmark value and all records from now onwards will have bookmark value less than last
+                            # saved bookmark value.
+                            synced_all_records = True
+                            break
+
+                        if self.tap_stream_id in selected_stream_ids and updated_at >= parent_bookmark_value:
+
+                            # Transform and write record
+                            with singer.Transformer() as transformer:
+                                rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata']))
+                                singer.write_record(self.tap_stream_id, rec, time_extracted=extraction_time)
+                                counter.increment()
+
+                        for child in self.children:
+                            if child in stream_to_sync:
+                                parent_id = tuple(record.get(key) for key in STREAMS[child]().id_keys)
+
+                                # Sync child stream, if it is selected or its nested child is selected.
+                                self.get_child_records(client,
+                                                       catalog,
+                                                       child,
+                                                       parent_id,
+                                                       repo_path,
+                                                       state,
+                                                       start_date,
+                                                       record.get(self.replication_keys),
+                                                       stream_to_sync,
+                                                       selected_stream_ids,
+                                                       parent_record = record)
+                    else:
+                        LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.",
+                                       self.tap_stream_id, self.key_properties, [record.get(key) for key in self.key_properties], self.replication_keys)
+
+                if synced_all_records:
+                    break
+
+        # Write bookmark for incremental stream.
+        self.write_bookmarks(self.tap_stream_id, selected_stream_ids, bookmark_value, repo_path, state)
+
+        return state
+
+class Reviews(IncrementalStream):
+    '''
+    https://docs.github.com/en/rest/reference/pulls#list-reviews-for-a-pull-request
+    '''
+    tap_stream_id = "reviews"
+    replication_method = "INCREMENTAL"
+    replication_keys = "submitted_at"
+    key_properties = ["id"]
+    path = "pulls/{}/reviews"
+    use_repository = True
+    id_keys = ['number']
+    parent = 'pull_requests'
+
+    def add_fields_at_1st_level(self, record, parent_record = None):
+        """
+        Add the parent pull request's `id` to the record as `pr_id` at the 1st level of JSON.
+        """
+        record['pr_id'] = parent_record['id']
+
+class ReviewComments(IncrementalOrderedStream):
+    '''
+    https://docs.github.com/en/rest/pulls/comments#get-a-review-comment-for-a-pull-request
+    '''
+    tap_stream_id = "review_comments"
+    replication_method = "INCREMENTAL"
+    replication_keys = "updated_at"
+    key_properties = ["id"]
+    path = "pulls/{}/comments?sort=updated&direction=desc"
+    use_repository = True
+    id_keys = ['number']
+    parent = 'pull_requests'
+
+    def add_fields_at_1st_level(self, record, parent_record = None):
+        """
+        Add the parent pull request's `id` to the record as `pr_id` at the 1st level of JSON.
+        """
+        record['pr_id'] = parent_record['id']
+
+class PRCommits(IncrementalStream):
+    '''
+    https://docs.github.com/en/rest/reference/pulls#list-commits-on-a-pull-request
+    '''
+    tap_stream_id = "pr_commits"
+    replication_method = "INCREMENTAL"
+    replication_keys = "updated_at"
+    key_properties = ["id"]
+    path = "pulls/{}/commits"
+    use_repository = True
+    id_keys = ['number']
+    parent = 'pull_requests'
+
+    def add_fields_at_1st_level(self, record, parent_record = None):
+        """
+        Add fields in the record explicitly at the 1st level of JSON.
class ProjectCards(IncrementalStream):
    '''
    Cards of a project column; child stream of `project_columns`.

    https://docs.github.com/en/rest/reference/projects#list-project-cards
    '''
    tap_stream_id = "project_cards"
    replication_method = "INCREMENTAL"
    replication_keys = "updated_at"
    key_properties = ["id"]
    # `{}` is a placeholder in the endpoint path; presumably filled with the
    # parent column id when the URL is built -- confirm against build_url().
    path = "projects/columns/{}/cards"
    parent = 'project_columns'
    # Fields read from the parent record to build the id tuple passed to get_child_records().
    id_keys = ['id']
key_properties = ["url"] + path = "orgs/{}/teams/{}/memberships/{}" + use_organization = True + parent = 'team_members' + id_keys = ["login"] + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['login'] = parent_record['login'] + +class TeamMembers(FullTableStream): + ''' + https://docs.github.com/en/rest/reference/teams#list-team-members + ''' + tap_stream_id = "team_members" + replication_method = "FULL_TABLE" + key_properties = ["team_slug", "id"] + path = "orgs/{}/teams/{}/members" + use_organization = True + id_keys = ['slug'] + children= ["team_memberships"] + has_children = True + parent = 'teams' + pk_child_fields = ['login'] + + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['team_slug'] = parent_record['slug'] + +class Teams(FullTableStream): + ''' + https://docs.github.com/en/rest/reference/teams#list-teams + ''' + tap_stream_id = "teams" + replication_method = "FULL_TABLE" + key_properties = ["id"] + path = "orgs/{}/teams" + use_organization = True + children= ["team_members"] + pk_child_fields = ['slug'] + +class Commits(IncrementalStream): + ''' + https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository + ''' + tap_stream_id = "commits" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["sha"] + path = "commits" + filter_param = True + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. 
+ """ + record['updated_at'] = record['commit']['committer']['date'] + +class Comments(IncrementalOrderedStream): + ''' + https://docs.github.com/en/rest/issues/comments#list-comments-in-a-repository + ''' + tap_stream_id = "comments" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["id"] + filter_param = True + path = "issues/comments?sort=updated&direction=desc" + +class Issues(IncrementalOrderedStream): + ''' + https://docs.github.com/en/rest/issues/issues#list-repository-issues + ''' + tap_stream_id = "issues" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["id"] + filter_param = True + path = "issues?state=all&sort=updated&direction=desc" + +class Assignees(FullTableStream): + ''' + https://docs.github.com/en/rest/issues/assignees#list-assignees + ''' + tap_stream_id = "assignees" + replication_method = "FULL_TABLE" + key_properties = ["id"] + path = "assignees" + +class Releases(FullTableStream): + ''' + https://docs.github.com/en/rest/releases/releases#list-releases + ''' + tap_stream_id = "releases" + replication_method = "FULL_TABLE" + key_properties = ["id"] + path = "releases?sort=created_at&direction=desc" + +class IssueLabels(FullTableStream): + ''' + https://docs.github.com/en/rest/issues/labels#list-labels-for-a-repository + ''' + tap_stream_id = "issue_labels" + replication_method = "FULL_TABLE" + key_properties = ["id"] + path = "labels" + +class IssueEvents(IncrementalOrderedStream): + ''' + https://docs.github.com/en/rest/reference/issues#list-issue-events-for-a-repository + ''' + tap_stream_id = "issue_events" + replication_method = "INCREMENTAL" + replication_keys = "created_at" + key_properties = ["id"] + path = "issues/events?sort=created_at&direction=desc" + +class Events(IncrementalStream): + ''' + https://docs.github.com/en/rest/activity/events#list-repository-events + ''' + tap_stream_id = "events" + replication_method = "INCREMENTAL" + replication_keys = 
"created_at" + key_properties = ["id"] + path = "events" + +class CommitComments(IncrementalStream): + ''' + https://docs.github.com/en/rest/commits/comments#list-commit-comments-for-a-repository + ''' + tap_stream_id = "commit_comments" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["id"] + path = "comments" + +class IssueMilestones(IncrementalOrderedStream): + ''' + https://docs.github.com/en/rest/issues/milestones#list-milestones + ''' + tap_stream_id = "issue_milestones" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["id"] + path = "milestones?direction=desc&sort=updated_at" + +class Collaborators(FullTableStream): + ''' + https://docs.github.com/en/rest/collaborators/collaborators#list-repository-collaborators + ''' + tap_stream_id = "collaborators" + replication_method = "FULL_TABLE" + key_properties = ["id"] + path = "collaborators" + +class StarGazers(FullTableStream): + ''' + https://docs.github.com/en/rest/activity/starring#list-stargazers + ''' + tap_stream_id = "stargazers" + replication_method = "FULL_TABLE" + key_properties = ["user_id"] + path = "stargazers" + headers = {'Accept': 'application/vnd.github.v3.star+json'} + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. 
def get_selected_streams(catalog):
    '''
    Return the `tap_stream_id` of every selected stream in the catalog.

    A stream counts as selected when one of its metadata entries has an
    empty breadcrumb (i.e. it is stream-level metadata) and a truthy
    'selected' value. Field-level entries are ignored. Each stream is
    reported at most once, in catalog order.
    '''
    selected_streams = []
    for stream in catalog['streams']:
        # Stream metadata will have an empty breadcrumb
        is_selected = any(
            not entry['breadcrumb'] and entry['metadata'].get('selected')
            for entry in stream['metadata']
        )
        if is_selected:
            selected_streams.append(stream['tap_stream_id'])

    return selected_streams
def get_ordered_stream_list(currently_syncing, streams_to_sync):
    """
    Return the streams sorted by name and rotated so that the list starts at
    `currently_syncing`; the streams sorted before it are moved to the end.

    If `currently_syncing` is not one of `streams_to_sync` (including None),
    the plain sorted list is returned.
    """
    stream_list = sorted(streams_to_sync)
    try:
        index = stream_list.index(currently_syncing)
    except ValueError:
        # Nothing to resume from: keep the alphabetical order.
        return stream_list
    return stream_list[index:] + stream_list[:index]
+ """ + syncing_repo = state.get("currently_syncing_repo") + if syncing_repo in repositories: + index = repositories.index(syncing_repo) + repositories = repositories[index:] + repositories[:index] + return repositories + +def translate_state(state, catalog, repositories): + ''' + This tap used to only support a single repository, in which case the + the state took the shape of: + { + "bookmarks": { + "commits": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + The tap now supports multiple repos, so this function should be called + at the beginning of each run to ensure the state is translated to the + new format: + { + "bookmarks": { + "singer-io/tap-adwords": { + "commits": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + "singer-io/tap-salesforce": { + "commits": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + } + ''' + nested_dict = lambda: collections.defaultdict(nested_dict) + new_state = nested_dict() + + # Collect keys(repo_name for update state or stream_name for older state) from state available in the `bookmarks`` + previous_state_keys = state.get('bookmarks', {}).keys() + # Collect stream names from the catalog + stream_names = [stream['tap_stream_id'] for stream in catalog['streams']] + + for key in previous_state_keys: + # Loop through each key of `bookmarks` available in the previous state. + + # Case 1: + # Older connections `bookmarks` contain stream names so check if it is the stream name or not. + # If the previous state's key is found in the stream name list then continue to check other keys. Because we want + # to migrate each stream's bookmark into the repo name as mentioned below: + # Example: {`bookmarks`: {`stream_a`: `bookmark_a`}} to {`bookmarks`: {`repo_a`: {`stream_a`: `bookmark_a`}}} + + # Case 2: + # Check if the key is available in the list of currently selected repo's list or not. Newer format `bookmarks` contain repo names. 
def is_any_child_selected(stream_obj, selected_streams):
    """
    Check if any child stream, or any nested child, is selected for the parent.

    `stream_obj` is anything exposing a `children` collection of stream names;
    each child is looked up in STREAMS to walk further down the hierarchy.
    Returns True as soon as one selected descendant is found, else False.
    """
    for child in stream_obj.children or []:
        if child in selected_streams:
            return True

        # Only a positive result may end the search: returning the recursive
        # result unconditionally would skip the remaining siblings whenever the
        # first child with children has no selected descendant.
        if is_any_child_selected(STREAMS[child], selected_streams):
            return True
    return False
+ """ + stream_obj = STREAMS[stream_id]() + + if stream_id in selected_streams: + # Get catalog object for particular stream. + stream = [cat for cat in catalog['streams'] if cat['tap_stream_id'] == stream_id ][0] + singer.write_schema(stream_id, stream['schema'], stream['key_properties']) + + for child in stream_obj.children: + write_schemas(child, catalog, selected_streams) + +def sync(client, config, state, catalog): + """ + Sync selected streams. + """ + + start_date = config['start_date'] + + # Get selected streams, make sure stream dependencies are met + selected_stream_ids = get_selected_streams(catalog) + + streams_to_sync = get_stream_to_sync(catalog) + LOGGER.info('Sync stream %s', streams_to_sync) + + repositories, organizations = client.extract_repos_from_config() + + state = translate_state(state, catalog, repositories) + singer.write_state(state) + + # Sync `teams`, `team_members`and `team_memberships` streams just single time for any organization. + streams_to_sync_for_orgs = set(streams_to_sync).intersection(STREAM_TO_SYNC_FOR_ORGS) + # Loop through all organizations + if selected_stream_ids: + for orgs in organizations: + LOGGER.info("Starting sync of organization: %s", orgs) + do_sync(catalog, streams_to_sync_for_orgs, selected_stream_ids, client, start_date, state, orgs) + + # Sync other streams for all repos + streams_to_sync_for_repos = set(streams_to_sync) - streams_to_sync_for_orgs + # pylint: disable=too-many-nested-blocks + # Sync repositories only if any streams are selected + for repo in get_ordered_repos(state, repositories): + update_currently_syncing_repo(state, repo) + LOGGER.info("Starting sync of repository: %s", repo) + do_sync(catalog, streams_to_sync_for_repos, selected_stream_ids, client, start_date, state, repo) + + if client.not_accessible_repos: + # Give warning messages for a repo that is not accessible by a stream or is invalid. 
+ message = "Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository for following streams {}.".format(repo, ", ".join(client.not_accessible_repos)) + LOGGER.warning(message) + client.not_accessible_repos = set() + update_currently_syncing_repo(state, None) + +def do_sync(catalog, streams_to_sync, selected_stream_ids, client, start_date, state, repo): + """ + Sync all other streams except teams, team_members and team_memberships for each repo. + """ + currently_syncing = singer.get_currently_syncing(state) + for stream_id in get_ordered_stream_list(currently_syncing, streams_to_sync): + stream_obj = STREAMS[stream_id]() + + # If it is a "sub_stream", it will be synced as part of the parent stream + if stream_id in streams_to_sync and not stream_obj.parent: + write_schemas(stream_id, catalog, selected_stream_ids) + update_currently_syncing(state, stream_id) + + state = stream_obj.sync_endpoint(client = client, + state = state, + catalog = catalog['streams'], + repo_path = repo, + start_date = start_date, + selected_stream_ids = selected_stream_ids, + stream_to_sync = streams_to_sync + ) + + singer.write_state(state) + update_currently_syncing(state, None) diff --git a/tests/base.py b/tests/base.py index 33c0478a..1d9eeb2f 100644 --- a/tests/base.py +++ b/tests/base.py @@ -4,9 +4,7 @@ from datetime import timedelta import time -import tap_tester.menagerie as menagerie -import tap_tester.connections as connections -import tap_tester.runner as runner +from tap_tester import menagerie, runner, connections, LOGGER class TestGithubBase(unittest.TestCase): @@ -15,14 +13,17 @@ class TestGithubBase(unittest.TestCase): INCREMENTAL = "INCREMENTAL" FULL = "FULL_TABLE" BOOKMARK = "bookmark" + PK_CHILD_FIELDS = "pk_child_fields" START_DATE_FORMAT = "%Y-%m-%dT00:00:00Z" # %H:%M:%SZ + BOOKMARK_FORMAT = "%Y-%m-%dT%H:%M:%SZ" + RECORD_REPLICATION_KEY_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + EVENTS_RECORD_REPLICATION_KEY_FORMAT = 
"%Y-%m-%dT%H:%M:%SZ" DATETIME_FMT = { "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S.000000Z" } START_DATE = "" - FULL_TABLE_SUB_STREAMS = ['reviews', 'review_comments', 'pr_commits', 'team_members', 'team_memberships'] OBEYS_START_DATE = "obey-start-date" def setUp(self): @@ -151,7 +152,8 @@ def expected_metadata(self): self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, self.BOOKMARK: {"updated_at"}, - self.OBEYS_START_DATE: True + self.OBEYS_START_DATE: True, + self.PK_CHILD_FIELDS: {"number"} }, "releases": { self.PRIMARY_KEYS: {"id"}, @@ -178,7 +180,8 @@ def expected_metadata(self): "team_members": { self.PRIMARY_KEYS: {"id", "team_slug"}, self.REPLICATION_METHOD: self.FULL, - self.OBEYS_START_DATE: False + self.OBEYS_START_DATE: False, + self.PK_CHILD_FIELDS: {"login"} }, "team_memberships": { self.PRIMARY_KEYS: {"url"}, @@ -188,12 +191,16 @@ def expected_metadata(self): "teams": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.FULL, - self.OBEYS_START_DATE: False + self.OBEYS_START_DATE: False, + self.PK_CHILD_FIELDS: {"slug"} } } def expected_replication_method(self): - """return a dictionary with key of table name and value of replication method""" + """ + Return a dictionary with key of table name + and value of replication method + """ return {table: properties.get(self.REPLICATION_METHOD, None) for table, properties in self.expected_metadata().items()} @@ -212,7 +219,7 @@ def expected_streams(self): def expected_primary_keys(self): """ - return a dictionary with key of table name + Return a dictionary with the key of the table name and value as a set of primary key fields """ return {table: properties.get(self.PRIMARY_KEYS, set()) @@ -220,7 +227,8 @@ def expected_primary_keys(self): in self.expected_metadata().items()} def expected_bookmark_keys(self): - """return a dictionary with key of table name + """ + Return a dictionary with the key of the table name and value as a set of bookmark key fields """ return 
def expected_automatic_keys(self):
    """
    Return a dictionary with the key of the table name
    and value as a set of automatic key fields

    The automatic fields of a table are the union of its primary keys,
    its bookmark keys and its child-stream primary key fields.
    """
    # Build each lookup once; calling these helpers inside the comprehension
    # would rebuild all three dictionaries for every table.
    primary_keys = self.expected_primary_keys()
    bookmark_keys = self.expected_bookmark_keys()
    child_pk_keys = self.expected_child_pk_keys()

    return {table: ((primary_keys.get(table) or set()) |
                    (bookmark_keys.get(table) or set()) |
                    (child_pk_keys.get(table) or set()))
            for table in self.expected_metadata()}
""" - # run in check mode + # Run in check mode check_job_name = runner.run_check_mode(self, conn_id) - # verify check exit codes + # Verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) @@ -256,9 +283,9 @@ def run_and_verify_check_mode(self, conn_id): self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set(map(lambda c: c['stream_name'], found_catalogs)) - print(found_catalog_names) + LOGGER.info(found_catalog_names) self.assertSetEqual(self.expected_streams(), found_catalog_names, msg="discovered schemas do not match") - print("discovered schemas are OK") + LOGGER.info("discovered schemas are OK") return found_catalogs @@ -282,7 +309,7 @@ def run_and_verify_sync(self, conn_id): sum(sync_record_count.values()), 0, msg="failed to replicate any data: {}".format(sync_record_count) ) - print("total replicated row count: {}".format(sum(sync_record_count.values()))) + LOGGER.info("total replicated row count: {}".format(sum(sync_record_count.values()))) return sync_record_count @@ -311,7 +338,7 @@ def perform_and_verify_table_and_field_selection(self, # Verify all testable streams are selected selected = catalog_entry.get('annotated-schema').get('selected') - print("Validating selection on {}: {}".format(cat['stream_name'], selected)) + LOGGER.info("Validating selection on {}: {}".format(cat['stream_name'], selected)) if cat['stream_name'] not in expected_selected: self.assertFalse(selected, msg="Stream selected, but not testable.") continue # Skip remaining assertions if we aren't selecting this stream @@ -321,14 +348,14 @@ def perform_and_verify_table_and_field_selection(self, # Verify all fields within each selected stream are selected for field, field_props in catalog_entry.get('annotated-schema').get('properties').items(): field_selected = field_props.get('selected') - 
print("\tValidating selection on {}.{}: {}".format( + LOGGER.info("\tValidating selection on {}.{}: {}".format( cat['stream_name'], field, field_selected)) self.assertTrue(field_selected, msg="Field not selected.") else: # Verify only automatic fields are selected - expected_automatic_fields = self.expected_primary_keys().get(cat['stream_name']) + expected_automatic_keys = self.expected_automatic_keys().get(cat['stream_name']) selected_fields = self.get_selected_fields_from_metadata(catalog_entry['metadata']) - self.assertEqual(expected_automatic_fields, selected_fields) + self.assertEqual(expected_automatic_keys, selected_fields) @staticmethod def get_selected_fields_from_metadata(metadata): @@ -352,7 +379,7 @@ def select_all_streams_and_fields(conn_id, catalogs, select_all_fields: bool = T non_selected_properties = [] if not select_all_fields: - # get a list of all properties so that none are selected + # Get a list of all properties so that none are selected non_selected_properties = schema.get('annotated-schema', {}).get( 'properties', {}).keys() @@ -372,13 +399,10 @@ def timedelta_formatted(self, dtime, days=0): def is_incremental(self, stream): return self.expected_metadata()[stream][self.REPLICATION_METHOD] == self.INCREMENTAL - def is_full_table_sub_stream(self, stream): - return stream in self.FULL_TABLE_SUB_STREAMS + def is_incremental_sub_stream(self, stream): + return stream in self.INCREMENTAL_SUB_STREAMS - def dt_to_ts(self, dtime): - for date_format in self.DATETIME_FMT: - try: - date_stripped = int(time.mktime(dt.strptime(dtime, date_format).timetuple())) - return date_stripped - except ValueError: - continue + def dt_to_ts(self, dtime, format): + """Convert datetime with a format to timestamp""" + date_stripped = int(time.mktime(dt.strptime(dtime, format).timetuple())) + return date_stripped diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py index 17173dc1..305a9151 100644 --- a/tests/test_github_all_fields.py +++ 
b/tests/test_github_all_fields.py @@ -4,6 +4,106 @@ from base import TestGithubBase +# As we are not able to generate the following fields by Github UI, so removed them from the expectation list. +KNOWN_MISSING_FIELDS = { + 'events': { + 'ref', + 'head', + 'push_id', + 'distinct_size', + 'size' + }, + 'project_cards': { + 'name', + 'cards_url', + 'column_name', + 'project_id' + }, + 'commits': { + 'files', + 'pr_id', + 'id', + 'pr_number', + 'stats', + }, + 'pr_commits': { + 'files', + 'stats' + }, + 'review_comments': { + 'assignees', + 'commits_url', + 'diff_url', + 'head', + 'review_comments_url', + 'comments_url', + 'issue_url', + 'assignee', + 'requested_teams', + 'patch_url', + 'milestone', + 'review_comment_url', + 'statuses_url', + 'requested_reviewers', + 'labels', + 'base', + 'merge_commit_sha', + 'locked', + 'body_text', + 'body_html' + }, + 'comments': { + 'home_url', + 'body_text', + 'body_html' + }, + 'team_members': { + 'email', + 'starred_at', + 'name', + }, + 'issues': { + 'body_text', + 'closed_by', + 'body_html' + }, + 'releases': { + 'discussion_url', + 'body_html', + 'body_text', + 'mentions_count', + 'reactions' + }, + 'collaborators': { + 'email', + 'name' + }, + 'reviews': { + 'body_text', + 'body_html' + }, + 'teams': { + 'permissions' + }, + 'projects': { + 'organization_permission', + 'private' + }, + 'assignees': { + 'email', + 'starred_at', + 'name' + }, + 'pull_requests': { + 'issues_url' + }, + 'issue_events': { + 'dismissed_review', + 'requested_team', + 'author_association', + 'draft' + }, +} class TestGithubAllFields(TestGithubBase): """Test that with all fields selected for a stream automatic and available fields are replicated""" @@ -14,43 +114,26 @@ def name(): def test_run(self): """ - Ensure running the tap with all streams and fields selected results in the - replication of all fields. - - Verify no unexpected streams were replicated - - Verify that more than just the automatic fields are replicated for each stream. 
+ • Verify no unexpected streams were replicated + • Verify that more than just the automatic fields are replicated for each stream. + • Verify all fields for each stream are replicated """ - # BUG TDL-16672 - # The excluded streams are not honoring all fields selection - excluded_streams = { - 'issue_events', - 'comments', - 'projects', - 'pr_commits', - 'events', - 'review_comments', - 'issues', - 'project_cards', - 'project_columns', - 'commits', - 'collaborators' - } - - expected_streams = self.expected_streams() - excluded_streams - - # instantiate connection + + expected_streams = self.expected_streams() + # Instantiate connection conn_id = connections.ensure_connection(self) - # run check mode + # Run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) - # table and field selection + # Table and field selection test_catalogs_all_fields = [catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs_all_fields, select_all_fields=True, ) - # grab metadata after performing table-and-field selection to set expectations + # Grab metadata after performing table-and-field selection to set expectations stream_to_all_catalog_fields = dict() # used for asserting all fields are replicated for catalog in test_catalogs_all_fields: stream_id, stream_name = catalog['stream_id'], catalog['stream_name'] @@ -60,7 +143,7 @@ def test_run(self): if md_entry['breadcrumb'] != []] stream_to_all_catalog_fields[stream_name] = set(fields_from_field_level_md) - # run initial sync + # Run initial sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() @@ -70,21 +153,22 @@ def test_run(self): for stream in expected_streams: with self.subTest(stream=stream): - # expected values - expected_automatic_keys = self.expected_primary_keys().get(stream) + # Expected values + expected_automatic_keys = 
self.expected_automatic_keys().get(stream) - # get all expected keys + # Get all expected keys expected_all_keys = stream_to_all_catalog_fields[stream] - # collect actual values messages = synced_records.get(stream) - actual_all_keys = [set(message['data'].keys()) for message in messages['messages'] - if message['action'] == 'upsert'][0] - - # Verify that you get some records for each stream - self.assertGreater(record_count_by_stream.get(stream, -1), 0) - - # verify all fields for a stream were replicated + # Collect actual values + actual_all_keys = set() + for message in messages['messages']: + if message['action'] == 'upsert': + actual_all_keys.update(message['data'].keys()) + + expected_all_keys = expected_all_keys - KNOWN_MISSING_FIELDS.get(stream, set()) + + # Verify all fields for a stream were replicated self.assertGreater(len(expected_all_keys), len(expected_automatic_keys)) self.assertTrue(expected_automatic_keys.issubset(expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"') self.assertSetEqual(expected_all_keys, actual_all_keys) diff --git a/tests/test_github_automatic_fields.py b/tests/test_github_automatic_fields.py index 7a5bc759..35b0de56 100644 --- a/tests/test_github_automatic_fields.py +++ b/tests/test_github_automatic_fields.py @@ -1,6 +1,3 @@ -""" -Test that with no fields selected for a stream automatic fields are still replicated -""" from tap_tester import runner, connections from base import TestGithubBase @@ -15,21 +12,19 @@ def name(): def test_run(self): """ - - Verify that for each stream you can get multiple pages of data - when no fields are selected. - - Verify that only the automatic fields are sent to the target. - - Verify that all replicated records have unique primary key values. + • Verify we can deselect all fields except when inclusion=automatic, which is handled by base.py methods + • Verify that only the automatic fields are sent to the target. 
+ • Verify that all replicated records have unique primary key values. """ - # Exclude collaborators stream due to access issues in circle - expected_streams = self.expected_streams() - {'collaborators'} + expected_streams = self.expected_streams() - # instantiate connection + # Instantiate connection conn_id = connections.ensure_connection(self) - # run check mode + # Run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) - # table and field selection + # Table and field selection test_catalogs_automatic_fields = [catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams] @@ -37,20 +32,22 @@ def test_run(self): conn_id, test_catalogs_automatic_fields, select_all_fields=False, ) - # run initial sync + # Run initial sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): - # expected values - expected_keys = self.expected_primary_keys().get(stream) + + # Expected values + expected_primary_keys = self.expected_primary_keys()[stream] + expected_keys = self.expected_automatic_keys().get(stream) - # collect actual values + # Collect actual values data = synced_records.get(stream, {}) record_messages_keys = [set(row.get('data').keys()) for row in data.get('messages', {})] primary_keys_list = [ - tuple(message.get('data').get(expected_pk) for expected_pk in expected_keys) + tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) for message in data.get('messages') if message.get('action') == 'upsert'] unique_primary_keys_list = set(primary_keys_list) diff --git a/tests/test_github_bookmarks.py b/tests/test_github_bookmarks.py index 3520a9d8..9e2c4135 100644 --- a/tests/test_github_bookmarks.py +++ b/tests/test_github_bookmarks.py @@ -8,77 +8,54 @@ class TestGithubBookmarks(TestGithubBase): + """Test tap sets a bookmark and respects it for the next sync of a stream""" + 
@staticmethod def name(): return "tap_tester_github_bookmarks" - @staticmethod - def convert_state_to_utc(date_str): - """ - Convert a saved bookmark value of the form '2020-08-25T13:17:36-07:00' to - a string formatted utc datetime, - in order to compare against json formatted datetime values - """ - date_object = dateutil.parser.parse(date_str) - date_object_utc = date_object.astimezone(tz=pytz.UTC) - return datetime.datetime.strftime(date_object_utc, "%Y-%m-%dT%H:%M:%SZ") - def calculated_states_by_stream(self, current_state, synced_records, replication_keys): """ Look at the bookmarks from a previous sync and set a new bookmark value based off timedelta expectations. This ensures the subsequent sync will replicate at least 1 record but, fewer records than the previous sync. - - If the test data is changed in the future this will break expectations for this test. """ timedelta_by_stream = {stream: [90,0,0] # {stream_name: [days, hours, minutes], ...} for stream in self.expected_streams()} - timedelta_by_stream['comments'] = [7, 0, 0] - timedelta_by_stream['commit_comments'] = [0, 0, 1] - timedelta_by_stream['commits'] = [0, 17, 0] - timedelta_by_stream['issue_events'] = [1, 0, 0] - timedelta_by_stream['issue_milestones'] = [0, 1, 0] - timedelta_by_stream['issues'] = [7, 0, 0] - timedelta_by_stream['pull_requests'] = [7, 0, 0] repo = self.get_properties().get('repository') - stream_to_calculated_state = {stream: "" for stream in current_state['bookmarks'][repo].keys()} + stream_to_calculated_state = {repo: {stream: "" for stream in current_state['bookmarks'][repo].keys()}} for stream, state in current_state['bookmarks'][repo].items(): state_key, state_value = next(iter(state.keys())), next(iter(state.values())) - sync_messages = [record.get('data') for record in - synced_records.get(stream, {'messages': []}).get('messages') - if record.get('action') == 'upsert'] - - # the `commits` and `pr_commits` streams don't have a top level replication_key field - if stream 
in ('commits', 'pr_commits'): - max_record_values = [values.get('commit', {}).get('committer', {}).get('date') - for values in sync_messages] - max_value = max(max_record_values) - else: - replication_key = next(iter(replication_keys.get(stream))) - max_record_values = [values.get(replication_key) for values in sync_messages] - max_value = max(max_record_values) - - # this is because the tap uses `time_extracted` to bookmark with `since` at execution - new_state_value = min(max_value, state_value) - state_as_datetime = dateutil.parser.parse(new_state_value) + state_as_datetime = dateutil.parser.parse(state_value) days, hours, minutes = timedelta_by_stream[stream] calculated_state_as_datetime = state_as_datetime - datetime.timedelta(days=days, hours=hours, minutes=minutes) - state_format = '%Y-%m-%dT%H:%M:%S-00:00' + state_format = '%Y-%m-%dT%H:%M:%SZ' calculated_state_formatted = datetime.datetime.strftime(calculated_state_as_datetime, state_format) - stream_to_calculated_state[stream] = {state_key: calculated_state_formatted} + stream_to_calculated_state[repo][stream] = {state_key: calculated_state_formatted} return stream_to_calculated_state def test_run(self): - # Exclude collaborators stream due to access issues in circle - expected_streams = self.expected_streams() - {'collaborators'} + """ + • Verify that for each stream you can do a sync which records bookmarks. + • Verify that the bookmark is the maximum value sent to the target for the replication key. + • Verify that a second sync respects the bookmark + All data of the second sync is >= the bookmark from the first sync + The number of records in the 2nd sync is less then the first + • Verify that for full table stream, all data replicated in sync 1 is replicated again in sync 2. 
+ + PREREQUISITE + For EACH stream that is incrementally replicated there are multiple rows of data with + different values for the replication key + """ + expected_streams = self.expected_streams() expected_replication_keys = self.expected_bookmark_keys() expected_replication_methods = self.expected_replication_method() @@ -109,8 +86,8 @@ def test_run(self): new_states = {'bookmarks': dict()} simulated_states = self.calculated_states_by_stream(first_sync_bookmarks, first_sync_records, expected_replication_keys) - for stream, new_state in simulated_states.items(): - new_states['bookmarks'][stream] = new_state + for repo, new_state in simulated_states.items(): + new_states['bookmarks'][repo] = new_state menagerie.set_state(conn_id, new_states) ########################################################################## @@ -128,10 +105,10 @@ def test_run(self): for stream in expected_streams: with self.subTest(stream=stream): - # expected values + # Expected values expected_replication_method = expected_replication_methods[stream] - # collect information for assertions from syncs 1 & 2 base on expected values + # Collect information for assertions from syncs 1 & 2 base on expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) first_sync_messages = [record.get('data') for record in @@ -145,12 +122,15 @@ def test_run(self): if expected_replication_method == self.INCREMENTAL: - # collect information specific to incremental streams from syncs 1 & 2 + # Collect information specific to incremental streams from syncs 1 & 2 replication_key = next(iter(expected_replication_keys[stream])) first_bookmark_value = first_bookmark_key_value.get('since') second_bookmark_value = second_bookmark_key_value.get('since') - first_bookmark_value_utc = self.convert_state_to_utc(first_bookmark_value) - second_bookmark_value_utc = self.convert_state_to_utc(second_bookmark_value) + + first_bookmark_value_ts = 
self.dt_to_ts(first_bookmark_value, self.BOOKMARK_FORMAT) + second_bookmark_value_ts = self.dt_to_ts(second_bookmark_value, self.BOOKMARK_FORMAT) + + simulated_bookmark_value = self.dt_to_ts(new_states['bookmarks'][repo][stream]['since'], self.BOOKMARK_FORMAT) # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_key_value) @@ -161,31 +141,34 @@ def test_run(self): self.assertIsNotNone(second_bookmark_key_value.get('since')) # Verify the second sync bookmark is Equal or Greater than the first sync bookmark - # the tap uses `time_extracted` and sets a bookmark using `since` for all real/pseudo incremental streams - self.assertGreaterEqual(second_bookmark_value, first_bookmark_value) - - for record in second_sync_messages: - # Verify the second sync bookmark value is the max replication key value for a given stream - if stream in ('commits', 'pr_commits'): - replication_key_value = record.get('commit', {}).get('committer', {}).get('date') - else: - replication_key_value = record.get(replication_key) - self.assertLessEqual( - replication_key_value, second_bookmark_value_utc, - msg="Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." 
- ) + self.assertGreaterEqual(second_bookmark_value_ts, first_bookmark_value_ts) + replication_key_format = self.RECORD_REPLICATION_KEY_FORMAT + # For events stream replication key value is coming in different format + if stream == 'events': + replication_key_format = self.EVENTS_RECORD_REPLICATION_KEY_FORMAT + for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream - if stream in ('commits', 'pr_commits'): - replication_key_value = record.get('commit', {}).get('committer', {}).get('date') - else: - replication_key_value = record.get(replication_key) + replication_key_value = self.dt_to_ts(record.get(replication_key), replication_key_format) + self.assertLessEqual( - replication_key_value, first_bookmark_value_utc, + replication_key_value, first_bookmark_value_ts, msg="First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) + for record in second_sync_messages: + # Verify the second sync bookmark value is the max replication key value for a given stream + replication_key_value = self.dt_to_ts(record.get(replication_key), replication_key_format) + + self.assertGreaterEqual(replication_key_value, simulated_bookmark_value, + msg="Second sync records do not respect the previous bookmark.") + + self.assertLessEqual( + replication_key_value, second_bookmark_value_ts, + msg="Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." + ) + # Verify the number of records in the 2nd sync is less then the first self.assertLessEqual(second_sync_count, first_sync_count) diff --git a/tests/test_github_discovery.py b/tests/test_github_discovery.py index 3d4c13f6..1fff7f0d 100644 --- a/tests/test_github_discovery.py +++ b/tests/test_github_discovery.py @@ -23,7 +23,7 @@ def test_run(self): • verify that primary keys are given the inclusion of automatic. • verify that all other fields have inclusion of available metadata. 
""" - streams_to_test = self.expected_streams() + expected_streams = self.expected_streams() conn_id = connections.ensure_connection(self) @@ -34,7 +34,7 @@ def test_run(self): self.assertTrue(all([re.fullmatch(r"[a-z_]+", name) for name in found_catalog_names]), msg="One or more streams don't follow standard naming") - for stream in streams_to_test: + for stream in expected_streams: with self.subTest(stream=stream): # Verify ensure the catalog is found for a given stream @@ -42,14 +42,15 @@ def test_run(self): if catalog["stream_name"] == stream])) self.assertIsNotNone(catalog) - # collecting expected values + # Collecting expected values expected_primary_keys = self.expected_primary_keys()[stream] - expected_automatic_fields = expected_primary_keys + expected_automatic_keys = self.expected_automatic_keys().get(stream) - # collecting actual values... + # Collecting actual values... schema_and_metadata = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) metadata = schema_and_metadata["metadata"] stream_properties = [item for item in metadata if item.get("breadcrumb") == []] + actual_fields = [md_entry.get("breadcrumb")[1] for md_entry in metadata if md_entry.get("breadcrumb") != []] actual_primary_keys = set( stream_properties[0].get( "metadata", {self.PRIMARY_KEYS: []}).get(self.PRIMARY_KEYS, []) @@ -60,24 +61,40 @@ def test_run(self): if item.get("metadata").get("inclusion") == "automatic" ) + actual_replication_method = stream_properties[0].get( + "metadata", {self.REPLICATION_METHOD: None}).get(self.REPLICATION_METHOD) + ########################################################################## ### metadata assertions ########################################################################## - # verify there is only 1 top level breadcrumb in metadata + # Verify there is only 1 top level breadcrumb in metadata self.assertTrue(len(stream_properties) == 1, msg="There is NOT only one top level breadcrumb for {}".format(stream) + \ "\nstream_properties | 
{}".format(stream_properties)) - # verify primary key(s) match expectations + # Verify there are no duplicate metadata entries + self.assertEqual(len(actual_fields), + len(set(actual_fields)), + msg = "duplication in the retrieved fields") + + # Verify primary key(s) match expectations self.assertSetEqual( expected_primary_keys, actual_primary_keys, ) - # verify that primary keys are given the inclusion of automatic in metadata. - self.assertSetEqual(expected_automatic_fields, actual_automatic_fields) - - # verify that all other fields have inclusion of available + # Verify that primary keys and replication keys are given the inclusion of automatic in metadata. + self.assertSetEqual(expected_automatic_keys, actual_automatic_fields) + + # Verify the actual replication matches our expected replication method + self.assertEqual( + self.expected_replication_method().get(stream, None), + actual_replication_method, + msg="The actual replication method {} doesn't match the expected {}".format( + actual_replication_method, + self.expected_replication_method().get(stream, None))) + + # Verify that all other fields have inclusion of available # This assumes there are no unsupported fields for SaaS sources self.assertTrue( all({item.get("metadata").get("inclusion") == "available" diff --git a/tests/test_github_interrupted_sync.py b/tests/test_github_interrupted_sync.py new file mode 100644 index 00000000..7c268604 --- /dev/null +++ b/tests/test_github_interrupted_sync.py @@ -0,0 +1,172 @@ +from tap_tester import connections, runner, menagerie +from base import TestGithubBase + + +class TestGithubInterruptedSync(TestGithubBase): + """Test tap's ability to recover from an interrupted sync""" + + @staticmethod + def name(): + return "tt_github_interrupted_sync_test" + + def get_properties(self): + """ + Maintain states for start_date and end_date + """ + return { + 'start_date' : '2021-10-01T00:00:00Z', + 'repository': 'singer-io/test-repo singer-io/singer-python' + } + + def 
test_run(self): + """ + Testing that if a sync job is interrupted and state is saved with `currently_syncing`(stream) and `currently_syncing_repo`, + the next sync job kicks off and the tap picks back up on that `currently_syncing` stream of `currently_syncing_repo`. + """ + streams_to_test = {"issues", "stargazers", "pull_requests", "issue_events"} + conn_id = connections.ensure_connection(self) + expected_replication_methods = self.expected_replication_method() + expected_replication_keys = self.expected_bookmark_keys() + repo_key = "_sdc_repository" + + start_date = self.dt_to_ts(self.get_properties().get("start_date"), self.BOOKMARK_FORMAT) + + # Run a discovery job + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Partition catalogs for use in table/field selection + test_catalogs = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in streams_to_test] + self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs, select_all_fields=True) + + # Run a sync + self.run_and_verify_sync(conn_id) + + # Acquire records from the target output + full_sync_records = runner.get_records_from_target_output() + full_sync_state = menagerie.get_state(conn_id) + + # Set state in which all streams of one repo(singer-io/singer-python) have completed a sync. + # And one stream (pull_requests) of other repo(singer-io/test-repo) is syncing currently. 
+ + interrupted_state = { + "currently_syncing": "pull_requests", + "currently_syncing_repo": "singer-io/test-repo", + "bookmarks": { + "singer-io/singer-python": { + "issues": { + "since": "2022-06-22T13:32:42Z" + }, + "pull_requests": { + "since": "2022-06-22T13:32:42Z" + }, + "issue_events": { + "since": "2022-06-22T13:32:42Z" + } + }, + "singer-io/test-repo": { + "issues": { + "since": "2022-07-13T09:21:19Z" + }, + "pull_requests": { + "since": "2022-06-30T05:33:24Z" + } + } + } + } + + menagerie.set_state(conn_id, interrupted_state) + + # Run another sync + self.run_and_verify_sync(conn_id) + + # acquire records from target output + interrupted_sync_records = runner.get_records_from_target_output() + final_state = menagerie.get_state(conn_id) + currently_syncing = final_state.get('currently_syncing') + + # Checking resuming sync resulted in a successfully saved state + with self.subTest(): + + # Verify sync is not interrupted by checking currently_syncing in the state for sync + self.assertIsNone(currently_syncing) + + # Verify bookmarks are saved + self.assertIsNotNone(final_state.get('bookmarks')) + + # Verify final_state is equal to uninterrupted sync's state + # (This is what the value would have been without an interruption and proves resuming succeeds) + self.assertDictEqual(final_state, full_sync_state) + + for repository in self.get_properties().get("repository").split(): + with self.subTest(repository=repository): + + full_sync_bookmark = full_sync_state["bookmarks"][repository] + final_bookmark = final_state["bookmarks"][repository] + interrupted_repo_bookmark = interrupted_state["bookmarks"][repository] + + for stream in streams_to_test: + with self.subTest(stream=stream): + + # Expected values + expected_replication_method = expected_replication_methods[stream] + expected_primary_keys = list(self.expected_primary_keys()[stream]) + + # Gather results + full_records = [message['data'] for message in + full_sync_records.get(stream, {}).get('messages', 
[]) + if message['data'][repo_key] == repository] + full_record_count = len(full_records) + + interrupted_records = [message['data'] for message in + interrupted_sync_records.get(stream, {}).get('messages', []) + if message['data'][repo_key] == repository] + interrupted_record_count = len(interrupted_records) + + if expected_replication_method == self.INCREMENTAL: + expected_replication_key = next(iter(expected_replication_keys[stream])) + + if stream in interrupted_repo_bookmark.keys(): + interrupted_bookmark = self.dt_to_ts(interrupted_repo_bookmark[stream]["since"], self.BOOKMARK_FORMAT) + + if stream == interrupted_state['currently_syncing'] and repository == interrupted_state['currently_syncing_repo']: + + for record in interrupted_records: + rec_time = self.dt_to_ts(record[expected_replication_key], self.RECORD_REPLICATION_KEY_FORMAT) + self.assertGreaterEqual(rec_time, interrupted_bookmark) + + # Verify all interrupted recs are in full recs + self.assertIn(record, full_records, msg='incremental table record in interrupted sync not found in full sync') + + # Record count for all streams of interrupted sync match expectations + full_records_after_interrupted_bookmark = 0 + + for record in full_records: + rec_time = self.dt_to_ts(record[expected_replication_key], self.RECORD_REPLICATION_KEY_FORMAT) + self.assertGreaterEqual(rec_time, start_date) + + if (rec_time >= interrupted_bookmark): + full_records_after_interrupted_bookmark += 1 + + self.assertEqual(full_records_after_interrupted_bookmark, len(interrupted_records), \ + msg="Expected {} records in each sync".format(full_records_after_interrupted_bookmark)) + else: + # Verify we collected records that have the same replication value as a bookmark for streams that are already synced + self.assertGreaterEqual(interrupted_record_count, 0) + else: + # Verify resuming sync replicates all records that were found in the full sync (uninterrupted) + for record in interrupted_records: + with 
self.subTest(record_primary_key=record[expected_primary_keys[0]]): + self.assertIn(record, full_records, msg='Unexpected record replicated in resuming sync.') + for record in full_records: + with self.subTest(record_primary_key=record[expected_primary_keys[0]]): + self.assertIn(record, interrupted_records, msg='Record missing from resuming sync.' ) + else: + # Verify full table streams do not save bookmarked values at the conclusion of a successful sync + self.assertNotIn(stream, full_sync_bookmark.keys()) + self.assertNotIn(stream, final_bookmark.keys()) + + # Verify first and second sync have the same records + self.assertEqual(full_record_count, interrupted_record_count) + for rec in interrupted_records: + self.assertIn(rec, full_records, msg='full table record in interrupted sync not found in full sync') diff --git a/tests/test_github_interrupted_sync_add_stream.py b/tests/test_github_interrupted_sync_add_stream.py new file mode 100644 index 00000000..0b46d389 --- /dev/null +++ b/tests/test_github_interrupted_sync_add_stream.py @@ -0,0 +1,177 @@ +from tap_tester import connections, runner, menagerie +from base import TestGithubBase + + +class TestGithubInterruptedSyncAddStream(TestGithubBase): + """Test tap's ability to recover from an interrupted sync""" + + @staticmethod + def name(): + return "tt_github_interrupted_sync_add_stream_test" + + def get_properties(self): + """ + Maintain states for start_date and end_date + """ + return { + 'start_date' : '2021-10-01T00:00:00Z', + 'repository': 'singer-io/test-repo singer-io/singer-python' + } + + def test_run(self): + """ + Testing that if a sync job is interrupted and state is saved with `currently_syncing`(stream) and `currently_syncing_repo`, + the next sync job kicks off and the tap picks back up on that `currently_syncing` stream of `currently_syncing_repo`. 
+ - Verify behavior is consistent when an added stream is selected between initial and resuming sync + """ + streams_to_test = {"issues", "stargazers", "pull_requests"} + conn_id = connections.ensure_connection(self) + expected_replication_methods = self.expected_replication_method() + expected_replication_keys = self.expected_bookmark_keys() + repo_key = "_sdc_repository" + + start_date = self.dt_to_ts(self.get_properties().get("start_date"), self.BOOKMARK_FORMAT) + + # Run a discovery job + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Partition catalogs for use in table/field selection + test_catalogs = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in streams_to_test] + self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs, select_all_fields=True) + + # Run a sync + self.run_and_verify_sync(conn_id) + + # Acquire records from the target output + full_sync_records = runner.get_records_from_target_output() + full_sync_state = menagerie.get_state(conn_id) + + # Add a stream between syncs + added_stream = 'issue_events' + streams_to_test.add(added_stream) + test_catalogs = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in streams_to_test] + # Add new stream to selected list + self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs, select_all_fields=True) + + # Set state in which all streams of one repo(singer-io/singer-python) have completed a sync. + # And one stream (pull_requests) of other repo(singer-io/test-repo) is syncing currently. 
+ + interrupted_state = { + "currently_syncing": "pull_requests", + "currently_syncing_repo": "singer-io/test-repo", + "bookmarks": { + "singer-io/singer-python": { + "issues": { + "since": "2022-06-22T13:32:42Z" + }, + "pull_requests": { + "since": "2022-06-22T13:32:42Z" + } + }, + "singer-io/test-repo": { + "issues": { + "since": "2022-07-14T07:47:21Z" + }, + "pull_requests": { + "since": "2022-07-13T07:47:21Z" + } + } + } + } + + menagerie.set_state(conn_id, interrupted_state) + + # Run another sync + self.run_and_verify_sync(conn_id) + + # acquire records from target output + interrupted_sync_records = runner.get_records_from_target_output() + final_state = menagerie.get_state(conn_id) + currently_syncing = final_state.get('currently_syncing') + + # Checking resuming sync resulted in a successfully saved state + with self.subTest(): + + # Verify sync is not interrupted by checking currently_syncing in the state for sync + self.assertIsNone(currently_syncing) + + # Verify bookmarks are saved + self.assertIsNotNone(final_state.get('bookmarks')) + + for repository in self.get_properties().get("repository").split(): + with self.subTest(repository=repository): + + full_sync_bookmark = full_sync_state["bookmarks"][repository] + final_bookmark = final_state["bookmarks"][repository] + interrupted_repo_bookmark = interrupted_state["bookmarks"][repository] + + for stream in streams_to_test: + with self.subTest(stream=stream): + + # Expected values + expected_replication_method = expected_replication_methods[stream] + + # Gather results + if stream != added_stream: + full_records = [message['data'] for message in + full_sync_records.get(stream, {}).get('messages', []) + if message['data'][repo_key] == repository] + full_record_count = len(full_records) + + interrupted_records = [message['data'] for message in + interrupted_sync_records.get(stream, {}).get('messages', []) + if message['data'][repo_key] == repository] + interrupted_record_count = len(interrupted_records) + 
+ if expected_replication_method == self.INCREMENTAL: + expected_replication_key = next(iter(expected_replication_keys[stream])) + + if stream in full_sync_bookmark.keys(): + full_sync_stream_bookmark = self.dt_to_ts(full_sync_bookmark.get(stream, {}).get("since"), self.BOOKMARK_FORMAT) + final_sync_stream_bookmark = self.dt_to_ts(final_bookmark.get(stream, {}).get("since"), self.BOOKMARK_FORMAT) + + if stream in interrupted_repo_bookmark.keys(): + interrupted_bookmark = self.dt_to_ts(interrupted_repo_bookmark[stream]["since"], self.BOOKMARK_FORMAT) + + for record in interrupted_records: + rec_time = self.dt_to_ts(record[expected_replication_key], self.RECORD_REPLICATION_KEY_FORMAT) + self.assertGreaterEqual(rec_time, interrupted_bookmark) + + else: + # verify we collected records that have the same replication value as a bookmark for streams that are already synced + self.assertGreater(interrupted_record_count, 0) + + if stream != added_stream: + + # Verify state ends with the same value for common streams after both full and interrupted syncs + self.assertEqual(full_sync_stream_bookmark, final_sync_stream_bookmark) + + for record in interrupted_records: + + # Verify all interrupted recs are in full recs + self.assertIn(record, full_records, msg='incremental table record in interrupted sync not found in full sync') + + # Record count for all streams of interrupted sync match expectations + full_records_after_interrupted_bookmark = 0 + + for record in full_records: + rec_time = self.dt_to_ts(record[expected_replication_key], self.RECORD_REPLICATION_KEY_FORMAT) + self.assertGreater(rec_time, start_date, msg=f"{expected_replication_key} {stream} {repository} {record}") + + if (rec_time >= interrupted_bookmark): + full_records_after_interrupted_bookmark += 1 + + self.assertGreaterEqual(full_records_after_interrupted_bookmark, interrupted_record_count, \ + msg="Expected max {} records in each sync".format(full_records_after_interrupted_bookmark)) + + else: + # Verify 
full table streams do not save bookmarked values after a successful sync + self.assertNotIn(stream, full_sync_bookmark.keys()) + self.assertNotIn(stream, final_bookmark.keys()) + + # Verify first and second sync have the same records + self.assertEqual(full_record_count, interrupted_record_count) + for rec in interrupted_records: + self.assertIn(rec, full_records, msg='full table record in interrupted sync not found in full sync') diff --git a/tests/test_github_interrupted_sync_remove_stream.py b/tests/test_github_interrupted_sync_remove_stream.py new file mode 100644 index 00000000..04ed54d6 --- /dev/null +++ b/tests/test_github_interrupted_sync_remove_stream.py @@ -0,0 +1,202 @@ +from tap_tester import connections, runner, menagerie +from base import TestGithubBase + + +class TestGithubInterruptedSyncRemoveStream(TestGithubBase): + """Test tap's ability to recover from an interrupted sync""" + + @staticmethod + def name(): + return "tt_github_interrupted_sync_remove_stream_test" + + def get_properties(self): + """ + Maintain states for start_date and end_date + """ + return { + 'start_date' : '2021-10-01T00:00:00Z', + 'repository': 'singer-io/test-repo singer-io/singer-python' + } + + def test_run(self): + + # Test for removing any stream from state + self.run_interrupted_sync("issue_events") + + # Test for removing currently syncing stream from state + self.run_interrupted_sync("pull_requests") + + def run_interrupted_sync(self, removed_stream): + """ + Testing that if a sync job is interrupted and state is saved with `currently_syncing`(stream) and `currently_syncing_repo`, + the next sync job kicks off and the tap picks back up on that `currently_syncing` stream of `currently_syncing_repo`. + - Verify behavior is consistent when a stream is removed from the selected list between initial and resuming sync. 
+ """ + streams_to_test = {"issues", "stargazers", "pull_requests", "issue_events"} + conn_id = connections.ensure_connection(self) + expected_replication_methods = self.expected_replication_method() + expected_replication_keys = self.expected_bookmark_keys() + repo_key = "_sdc_repository" + + start_date = self.dt_to_ts(self.get_properties().get("start_date"), self.BOOKMARK_FORMAT) + + # Run a discovery job + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Partition catalogs for use in table/field selection + test_catalogs = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in streams_to_test] + self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs, select_all_fields=True) + + # Run a sync + self.run_and_verify_sync(conn_id) + + # Acquire records from target output + full_sync_records = runner.get_records_from_target_output() + full_sync_state = menagerie.get_state(conn_id) + + # Create new connection for another sync + conn_id_2 = connections.ensure_connection(self) + + # Add a stream between syncs + streams_to_test = streams_to_test - {removed_stream} + found_catalogs = self.run_and_verify_check_mode(conn_id_2) + + test_catalogs = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in streams_to_test] + + # Add new stream to selected list + self.perform_and_verify_table_and_field_selection(conn_id_2, test_catalogs, select_all_fields=True) + + # Set state in which all streams of one repo(singer-io/singer-python) have completed a sync. + # And one stream (pull_requests) of other repo(singer-io/test-repo) is syncing currently. 
+ + interrupted_state = { + "currently_syncing": "pull_requests", + "currently_syncing_repo": "singer-io/test-repo", + "bookmarks": { + "singer-io/singer-python": { + "issues": { + "since": "2022-06-22T13:32:42Z" + }, + "pull_requests": { + "since": "2022-06-22T13:32:42Z" + }, + "issue_events": { + "since": "2022-06-22T13:32:42Z" + } + }, + "singer-io/test-repo": { + "issues": { + "since": "2022-07-14T07:47:21Z" + }, + "pull_requests": { + "since": "2022-07-13T07:47:21Z" + } + } + } + } + + menagerie.set_state(conn_id_2, interrupted_state) + + # Run another sync + self.run_and_verify_sync(conn_id_2) + + # Acquire records from target output + interrupted_sync_records = runner.get_records_from_target_output() + final_state = menagerie.get_state(conn_id_2) + currently_syncing = final_state.get('currently_syncing') + + # Checking resuming sync resulted in a successfully saved state + with self.subTest(): + + # Verify sync is not interrupted by checking currently_syncing in the state for sync + self.assertIsNone(currently_syncing) + + # Verify bookmarks are saved + self.assertIsNotNone(final_state.get('bookmarks')) + + for repository in self.get_properties().get("repository").split(): + with self.subTest(repository=repository): + + full_sync_bookmark = full_sync_state["bookmarks"][repository] + final_bookmark = final_state["bookmarks"][repository] + interrupted_repo_bookmark = interrupted_state["bookmarks"][repository] + + for stream in list(streams_to_test) + [removed_stream]: + with self.subTest(stream=stream): + + # Expected values + expected_replication_method = expected_replication_methods[stream] + expected_primary_keys = list(self.expected_primary_keys()[stream]) + + # Gather results + full_records = [message['data'] for message in + full_sync_records.get(stream, {}).get('messages', []) + if message['data'][repo_key] == repository] + full_record_count = len(full_records) + + if stream != removed_stream: + interrupted_records = [message['data'] for message in + 
interrupted_sync_records.get(stream, {}).get('messages', []) + if message['data'][repo_key] == repository] + interrupted_record_count = len(interrupted_records) + else: + self.assertNotIn(stream, interrupted_sync_records.keys()) + + if expected_replication_method == self.INCREMENTAL: + expected_replication_key = next(iter(expected_replication_keys[stream])) + full_sync_stream_bookmark = self.dt_to_ts(full_sync_bookmark.get(stream, {}).get("since"), self.BOOKMARK_FORMAT) + + if stream in interrupted_repo_bookmark.keys(): + interrupted_bookmark = self.dt_to_ts(interrupted_repo_bookmark[stream]["since"], self.BOOKMARK_FORMAT) + final_sync_stream_bookmark = self.dt_to_ts(final_bookmark.get(stream, {}).get("since"), self.BOOKMARK_FORMAT) + + if stream != removed_stream: + + # Verify state ends with the same value for common streams after both full and interrupted syncs + self.assertEqual(full_sync_stream_bookmark, final_sync_stream_bookmark) + + # Verify resuming sync only replicates records with replication key values greater or equal to + # the interrupted_state for streams that were completed, replicated during the interrupted sync. 
+ for record in interrupted_records: + with self.subTest(record_primary_key=record[expected_primary_keys[0]]): + rec_time = self.dt_to_ts(record[expected_replication_key], self.RECORD_REPLICATION_KEY_FORMAT) + self.assertGreaterEqual(rec_time, interrupted_bookmark) + + # Verify all interrupted recs are in full recs + self.assertIn(record, full_records, msg='Incremental table record in interrupted sync not found in full sync') + + # Record count for all streams of interrupted sync match expectations + full_records_after_interrupted_bookmark = 0 + for record in full_records: + rec_time = self.dt_to_ts(record[expected_replication_key], self.RECORD_REPLICATION_KEY_FORMAT) + self.assertGreater(rec_time, start_date, msg=f"{expected_replication_key} {stream} {repository} {record}") + + if (rec_time >= interrupted_bookmark): + full_records_after_interrupted_bookmark += 1 + + self.assertGreaterEqual(full_records_after_interrupted_bookmark, interrupted_record_count, \ + msg="Expected max {} records in each sync".format(full_records_after_interrupted_bookmark)) + else: + # Verify the bookmark has not advanced for the removed stream + self.assertEqual(final_sync_stream_bookmark, interrupted_bookmark) + else: + # verify we collected records that have the same replication value as a bookmark for streams that are already synced + self.assertGreater(interrupted_record_count, 0) + + else: + # Verify full table streams do not save bookmarked values after a successful sync + self.assertNotIn(stream, full_sync_bookmark.keys()) + self.assertNotIn(stream, final_bookmark.keys()) + + # Verify first and second sync have the same records + self.assertEqual(full_record_count, interrupted_record_count) + for rec in interrupted_records: + self.assertIn(rec, full_records, msg='Full table record in interrupted sync not found in full sync') + + # Verify at least 1 record was replicated for each stream + if stream != removed_stream: + self.assertGreater(interrupted_record_count, 0) + + 
print(f"{stream} resumed sync records replicated: {interrupted_record_count}") diff --git a/tests/test_github_pagination.py b/tests/test_github_pagination.py index 6beed905..06a24abd 100644 --- a/tests/test_github_pagination.py +++ b/tests/test_github_pagination.py @@ -1,3 +1,5 @@ +from math import ceil + from tap_tester import runner, connections from base import TestGithubBase @@ -11,7 +13,7 @@ def name(): def get_properties(self, original: bool = True): return_value = { 'start_date' : '2020-01-01T00:00:00Z', - 'repository': 'singer-io/tap-github' + 'repository': self.repository_name } if original: return return_value @@ -21,15 +23,38 @@ def get_properties(self, original: bool = True): return return_value def test_run(self): - # page size for "pull_requests" + + streams_to_test = self.expected_streams() + + # Pagination is not supported for "team_memberships" by Github API. + # Skipping "teams" stream as it's RECORD count is <= 30. + untestable_streams = {'team_memberships', 'teams'} + + # For some streams RECORD count were not > 30 in same test-repo. + # So, separated streams on the basis of RECORD count. + self.repository_name = 'singer-io/tap-github' + expected_stream_1 = {'comments', 'stargazers', 'commits', 'pull_requests', 'reviews', 'review_comments', 'pr_commits', 'issues'} + self.run_test(expected_stream_1) + + self.repository_name = 'singer-io/test-repo' + expected_stream_2 = streams_to_test - expected_stream_1 - untestable_streams + self.run_test(expected_stream_2) + + def run_test(self, streams): + """ + • Verify that for each stream you can get multiple pages of data. + This requires we ensure more than 1 page of data exists at all times for any given stream. + • Verify by pks that the data replicated matches the data we expect. 
+ """ + + # Page size for pagination supported streams page_size = 30 conn_id = connections.ensure_connection(self) - # Checking pagination for "pull_requests" stream - expected_streams = ["pull_requests"] + expected_streams = streams found_catalogs = self.run_and_verify_check_mode(conn_id) - # table and field selection + # Table and field selection test_catalogs = [catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams] @@ -39,27 +64,42 @@ def test_run(self): synced_records = runner.get_records_from_target_output() + # Verify no unexpected streams were replicated + synced_stream_names = set(synced_records.keys()) + self.assertSetEqual(expected_streams, synced_stream_names) + for stream in expected_streams: with self.subTest(stream=stream): - # expected values + # Expected values expected_primary_keys = self.expected_primary_keys()[stream] - # collect information for assertions from syncs 1 & 2 base on expected values + # Collect information for assertions from syncs 1 & 2 base on expected values record_count_sync = record_count_by_stream.get(stream, 0) primary_keys_list = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) for message in synced_records.get(stream).get('messages') if message.get('action') == 'upsert'] - # verify records are more than page size so multiple page is working - self.assertGreater(record_count_sync, page_size) - - primary_keys_list_1 = primary_keys_list[:page_size] - primary_keys_list_2 = primary_keys_list[page_size:2*page_size] - - primary_keys_page_1 = set(primary_keys_list_1) - primary_keys_page_2 = set(primary_keys_list_2) - - # Verify by private keys that data is unique for page - self.assertEqual(len(primary_keys_page_1), page_size) - self.assertTrue(primary_keys_page_1.isdisjoint(primary_keys_page_2)) + # Verify that for each stream you can get multiple pages of data + self.assertGreater(record_count_sync, page_size, + msg="The number of records is not over the 
stream max limit") + + # Chunk the replicated records (just primary keys) into expected pages + pages = [] + page_count = ceil(len(primary_keys_list) / page_size) + for page_index in range(page_count): + page_start = page_index * page_size + page_end = (page_index + 1) * page_size + pages.append(set(primary_keys_list[page_start:page_end])) + + # Verify by primary keys that data is unique for each page + for current_index, current_page in enumerate(pages): + with self.subTest(current_page_primary_keys=current_page): + + for other_index, other_page in enumerate(pages): + if current_index == other_index: + continue # don't compare the page to itself + + self.assertTrue( + current_page.isdisjoint(other_page), msg=f'other_page_primary_keys={other_page}' + ) \ No newline at end of file diff --git a/tests/test_github_parent_child_independednt.py b/tests/test_github_parent_child_independednt.py new file mode 100644 index 00000000..eb28da8c --- /dev/null +++ b/tests/test_github_parent_child_independednt.py @@ -0,0 +1,48 @@ +from tap_tester import runner, connections +from base import TestGithubBase + +class GithubParentChildIndependentTest(TestGithubBase): + + def name(self): + return "tap_tester_github_parent_child_test" + + def test_first_level_child_streams(self): + """ + Test case to verify that tap is working fine if only first level child streams are selected + """ + # Select first_level_child_streams only and run test + first_level_child_streams = {"team_members", "project_columns", "reviews", "review_comments", "pr_commits"} + self.run_test(first_level_child_streams) + + def test_second_level_child_streams(self): + """ + Test case to verify that tap is working fine if only second level child streams are selected + """ + # Select second_level_child_streams only and run test + second_level_child_streams = {"team_memberships", "project_cards"} + self.run_test(second_level_child_streams) + + def run_test(self, child_streams): + """ + Testing that tap is working fine if 
only child streams are selected + • Verify that if only child streams are selected then only child streams are replicated. + """ + # Instantiate connection + conn_id = connections.ensure_connection(self) + + # Run check mode + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Table and field selection + test_catalogs = [catalog for catalog in found_catalogs + if catalog.get('stream_name') in child_streams] + + self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs) + + # Run initial sync + record_count_by_stream = self.run_and_verify_sync(conn_id) + synced_records = runner.get_records_from_target_output() + + # Verify no unexpected streams were replicated + synced_stream_names = set(synced_records.keys()) + self.assertSetEqual(child_streams, synced_stream_names) \ No newline at end of file diff --git a/tests/test_github_start_date.py b/tests/test_github_start_date.py index 34065255..5ea10ced 100644 --- a/tests/test_github_start_date.py +++ b/tests/test_github_start_date.py @@ -1,12 +1,13 @@ import os import requests -from tap_tester import connections, runner +from tap_tester import connections, runner, LOGGER from base import TestGithubBase from datetime import datetime, timedelta class GithubStartDateTest(TestGithubBase): + """Test that the start_date configuration is respected""" start_date_1 = "" start_date_2 = "" @@ -31,9 +32,23 @@ def test_run(self): # generate data for 'events' stream self.generate_data() + date_1 = '2020-04-01T00:00:00Z' + date_2 = '2021-10-08T00:00:00Z' + expected_stream_1 = {'commits'} + self.run_test(date_1, date_2, expected_stream_1) + + date_2 = '2022-07-13T00:00:00Z' + expected_stream_2 = {'issue_milestones'} + self.run_test(date_1, date_2, expected_stream_2) + + date_2 = '2022-05-06T00:00:00Z' + expected_stream_3 = {'pull_requests', 'pr_commits', 'review_comments', 'reviews'} + self.run_test(date_1, date_2, expected_stream_3) + + date_2 = '2022-01-27T00:00:00Z' # run the test for all the streams 
excluding 'events' stream + # as for 'events' stream we have to use dynamic dates - self.run_test('2020-04-01T00:00:00Z', '2021-10-08T00:00:00Z', self.expected_streams() - {'events'}) + self.run_test(date_1, date_2, self.expected_streams() - expected_stream_1 - expected_stream_2 - expected_stream_3 - {'events'}) # As per the Documentation: https://docs.github.com/en/rest/reference/activity#events # the 'events' of past 90 days will only be returned @@ -45,13 +60,21 @@ def test_run(self): self.run_test(date_1, date_2, {'events'}) def run_test(self, date_1, date_2, streams): - """Instantiate start date according to the desired data set and run the test""" + """ + • Verify that a sync with a later start date has at least one record synced + and less records than the 1st sync with a previous start date + • Verify that each stream has less records than the earlier start date sync + • Verify all data from later start date has bookmark values >= start_date + • Verify that the minimum bookmark sent to the target for the later start_date sync + is greater than or equal to the start date + • Verify by primary key values, that all records in the 1st sync are included in the 2nd sync.
+ """ self.start_date_1 = date_1 self.start_date_2 = date_2 - start_date_1_epoch = self.dt_to_ts(self.start_date_1) - start_date_2_epoch = self.dt_to_ts(self.start_date_2) + start_date_1_epoch = self.dt_to_ts(self.start_date_1, self.START_DATE_FORMAT) + start_date_2_epoch = self.dt_to_ts(self.start_date_2, self.START_DATE_FORMAT) self.START_DATE = self.start_date_1 @@ -66,8 +89,7 @@ def run_test(self, date_1, date_2, streams): # run check mode found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1) - # print(found_catalogs_1) - + # table and field selection test_catalogs_1_all_fields = [catalog for catalog in found_catalogs_1 if catalog.get('stream_name') in expected_streams] @@ -81,7 +103,7 @@ def run_test(self, date_1, date_2, streams): ### Update START DATE Between Syncs ########################################################################## - print("REPLICATION START DATE CHANGE: {} ===>>> {} ".format(self.START_DATE, self.start_date_2)) + LOGGER.info("REPLICATION START DATE CHANGE: {} ===>>> {} ".format(self.START_DATE, self.start_date_2)) self.START_DATE = self.start_date_2 ########################################################################## @@ -103,19 +125,11 @@ def run_test(self, date_1, date_2, streams): record_count_by_stream_2 = self.run_and_verify_sync(conn_id_2) synced_records_2 = runner.get_records_from_target_output() - # Verify the total number of records replicated in sync 1 is greater than the number - # of records replicated in sync 2 + # verify that sync 2 has at least one record synced and less records than sync 1 + self.assertGreater(sum(record_count_by_stream_2.values()), 0) self.assertGreater(sum(record_count_by_stream_1.values()), sum(record_count_by_stream_2.values())) for stream in expected_streams: - - # There are no data or not enough data for testing for below streams - # commit_comments, releases -> No data in tap-github repositery - # issue_milestones -> One data for isuue_milestones so not able to pass incremental 
cases - # projects, projects_columns, project_cards -> One record for project so not able to pass incremental cases - if stream in ["commit_comments", "releases", "issue_milestones", "projects", "project_columns", "project_cards"]: - continue - with self.subTest(stream=stream): # expected values @@ -136,29 +150,42 @@ def run_test(self, date_1, date_2, streams): primary_keys_sync_1 = set(primary_keys_list_1) primary_keys_sync_2 = set(primary_keys_list_2) + # verify that sync 2 has at least one record synced + self.assertGreater(record_count_sync_2, 0) + if expected_metadata.get(self.OBEYS_START_DATE): - # Sub stream fetch all data for records of related incremental super stream. - # Data of commit doesn't contain created_at or updated_at field. - # Data of isuue_milestomes contains bookmark key(due_on) with null value also. - if not self.is_full_table_sub_stream(stream) and stream != 'commits': - - # Expected bookmark key is one element in set so directly access it - bookmark_keys_list_1 = [message.get('data').get(next(iter(expected_bookmark_keys))) for message in synced_records_1.get(stream).get('messages') - if message.get('action') == 'upsert'] - bookmark_keys_list_2 = [message.get('data').get(next(iter(expected_bookmark_keys))) for message in synced_records_2.get(stream).get('messages') - if message.get('action') == 'upsert'] - - bookmark_key_sync_1 = set(bookmark_keys_list_1) - bookmark_key_sync_2 = set(bookmark_keys_list_2) - - # Verify bookmark key values are greater than or equal to start date of sync 1 - for bookmark_key_value in bookmark_key_sync_1: - self.assertGreaterEqual(self.dt_to_ts(bookmark_key_value), start_date_1_epoch) - - # Verify bookmark key values are greater than or equal to start date of sync 2 - for bookmark_key_value in bookmark_key_sync_2: - self.assertGreaterEqual(self.dt_to_ts(bookmark_key_value), start_date_2_epoch) + # Expected bookmark key is one element in set so directly access it + bookmark_keys_list_1 = 
[message.get('data').get(next(iter(expected_bookmark_keys))) for message in synced_records_1.get(stream).get('messages') + if message.get('action') == 'upsert'] + bookmark_keys_list_2 = [message.get('data').get(next(iter(expected_bookmark_keys))) for message in synced_records_2.get(stream).get('messages') + if message.get('action') == 'upsert'] + + bookmark_key_sync_1 = set(bookmark_keys_list_1) + bookmark_key_sync_2 = set(bookmark_keys_list_2) + + replication_key_format = self.RECORD_REPLICATION_KEY_FORMAT + # For events stream replication key value is coming in different format + if stream == 'events': + replication_key_format = self.EVENTS_RECORD_REPLICATION_KEY_FORMAT + + # Verify bookmark key values are greater than or equal to start date of sync 1 + for bookmark_key_value in bookmark_key_sync_1: + self.assertGreaterEqual( + self.dt_to_ts(bookmark_key_value, replication_key_format), start_date_1_epoch, + msg="Report pertains to a date prior to our start date.\n" + + "Sync start_date: {}\n".format(self.start_date_1) + + "Record date: {} ".format(bookmark_key_value) + ) + + # Verify bookmark key values are greater than or equal to start date of sync 2 + for bookmark_key_value in bookmark_key_sync_2: + self.assertGreaterEqual( + self.dt_to_ts(bookmark_key_value, replication_key_format), start_date_2_epoch, + msg="Report pertains to a date prior to our start date.\n" + + "Sync start_date: {}\n".format(self.start_date_2) + + "Record date: {} ".format(bookmark_key_value) + ) # Verify the number of records replicated in sync 1 is greater than the number # of records replicated in sync 2 for stream diff --git a/tests/test_github_sync.py b/tests/test_github_sync.py index d8bde66a..244cab7f 100644 --- a/tests/test_github_sync.py +++ b/tests/test_github_sync.py @@ -8,6 +8,14 @@ class TestGithubSync(TestGithubBase): def name(): return "tap_tester_github_sync_test" + def get_properties(self): + + return { + 'start_date' : '2021-10-01T00:00:00Z', + 'base_url': 
'https://api.github.com', + 'repository': 'singer-io/test-repo' + } + def test_run(self): """ Testing that sync creates the appropriate catalog with valid metadata. diff --git a/tests/unittests/test_currently_syncing.py b/tests/unittests/test_currently_syncing.py new file mode 100644 index 00000000..044ae951 --- /dev/null +++ b/tests/unittests/test_currently_syncing.py @@ -0,0 +1,114 @@ +import unittest +from unittest import mock +from tap_github.sync import (update_currently_syncing_repo, update_currently_syncing, + get_ordered_stream_list, get_ordered_repos) + +class TestGetOrderedStreamList(unittest.TestCase): + """ + Test `get_ordered_stream_list` function to get ordered list of streams + """ + + streams_to_sync = ["commits", "pull_requests", "collaborators", "releases", "issue_labels", "assignees", "stargazers", "teams"] + + def test_currently_syncing_not_in_list(self): + """Test if currently syncing is not available in `streams_to_sync` list, function returns sorted streams_to_sync list.""" + expected_list = ['assignees', 'collaborators', 'commits', 'issue_labels', +
'pull_requests', 'releases', 'stargazers', 'teams'] + final_list = get_ordered_stream_list(None, self.streams_to_sync) + + # Verify with expected ordered list of streams + self.assertEqual(final_list, expected_list) + +class TestGetOrderedRepos(unittest.TestCase): + + """ + Test `get_ordered_repos` function to get ordered list repositories. + """ + repo_list = ["org/repo1", "org/repo2", "org/repo3", "org/repo4", "org/repo5"] + + def test_for_interupted_sync(self): + """Test when the sync was interrupted, the function returns ordered list of repositories starting with 'currently_syncing_repo'.""" + state = {"currently_syncing_repo": "org/repo3"} + expected_list = ["org/repo3", "org/repo4", "org/repo5", "org/repo1", "org/repo2"] + final_repo_list = get_ordered_repos(state, self.repo_list) + + # Verify with expected ordered list of repos + self.assertEqual(final_repo_list, expected_list) + + def test_currently_syncing_repo_removed_from_config(self): + """Test if currently syncing repo was removed from config.""" + state = {"currently_syncing_repo": "org/repo3"} + repo_list = ["org/repo1", "org/repo2", "org/repo4", "org/repo5"] + final_repo_list = get_ordered_repos(state, repo_list) + + # Verify with expected ordered list of repos + self.assertEqual(final_repo_list, repo_list) + + def test_for_completed_sync(self): + """Test when sync was not interrupted, the function returns repos list.""" + state = {} + final_repo_list = get_ordered_repos(state, self.repo_list) + + # Verify with expected ordered list of repos + self.assertEqual(final_repo_list, self.repo_list) + +@mock.patch("tap_github.sync.update_currently_syncing") +class TestUpdateCurrentlySyncingRepo(unittest.TestCase): + + """ + Test `update_currently_syncing_repo` function of sync. 
+ """ + def test_adding_repo(self, mock_currently_syncing): + """Test for adding currently syncing repo in state""" + state = {"currently_syncing_repo": None} + update_currently_syncing_repo(state, "org/test-repo") + + # Verify with expected state + self.assertEqual(state, {"currently_syncing_repo": "org/test-repo"}) + + def test_flush_completed_repo(self, mock_currently_syncing): + """Test for removing currently syncing repo from state.""" + state = {"currently_syncing_repo": "org/test-repo"} + update_currently_syncing_repo(state, None) + + # Verify with expected state + self.assertEqual(state, {}) + +class TestUpdateCurrentlySyncing(unittest.TestCase): + + """ + Test `update_currently_syncing` function of sync. + """ + def test_update_syncing_stream(self): + """Test for adding currently syncing stream in state.""" + state = {"currently_syncing": "assignees"} + update_currently_syncing(state, "issues") + + # Verify with expected state + self.assertEqual(state, {"currently_syncing": "issues"}) + + def test_flush_currently_syncing(self): + """Test for removing currently syncing stream from state.""" + state = {"currently_syncing": "assignees"} + update_currently_syncing(state, None) + + # Verify with expected state + self.assertEqual(state, {}) diff --git a/tests/unittests/test_custom_domain.py b/tests/unittests/test_custom_domain.py new file mode 100644 index 00000000..139b2426 --- /dev/null +++ b/tests/unittests/test_custom_domain.py @@ -0,0 +1,29 @@ +import unittest +from unittest import mock +from tap_github.client import GithubClient, DEFAULT_DOMAIN + +@mock.patch('tap_github.GithubClient.verify_access_for_repo', return_value = None) +class TestCustomDomain(unittest.TestCase): + """ + Test custom domain is supported in client + """ + + def test_config_without_domain(self, mock_verify_access): + """ + Test if the domain is not given in the config + """ + mock_config = {'repository': 'singer-io/test-repo', "access_token": ""} + test_client = 
GithubClient(mock_config) + + # Verify domain in client is default + self.assertEqual(test_client.base_url, DEFAULT_DOMAIN) + + def test_config_with_domain(self, mock_verify_access): + """ + Test if the domain is given in the config + """ + mock_config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} + test_client = GithubClient(mock_config) + + # Verify domain in client is from config + self.assertEqual(test_client.base_url, mock_config["base_url"]) diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index e2c86120..8c381054 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -1,10 +1,14 @@ from unittest import mock import tap_github +from tap_github.client import GithubClient, raise_for_error, ConflictError, BadRequestException, BadCredentialsException, AuthException, InternalServerError import unittest import requests +from parameterized import parameterized class Mockresponse: - def __init__(self, status_code, json, raise_error, headers={'X-RateLimit-Remaining': 1}, text=None, content=None): + """ Mock response object class.""" + + def __init__(self, status_code, json, raise_error, headers={'X-RateLimit-Remaining': 1}, content=None): self.status_code = status_code self.raise_error = raise_error self.text = json @@ -18,106 +22,101 @@ def raise_for_status(self): raise requests.HTTPError("Sample message") def json(self): + """ Response JSON method.""" return self.text +def get_mock_http_response(status_code, contents): + """Return http mock response.""" + response = requests.Response() + response.status_code = status_code + response._content = contents.encode() + return response + def get_response(status_code, json={}, raise_error=False, content=None): + """ Returns required mock response. 
""" return Mockresponse(status_code, json, raise_error, content=content) +@mock.patch("time.sleep") +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) @mock.patch("requests.Session.request") @mock.patch("singer.utils.parse_args") class TestExceptionHandling(unittest.TestCase): - def test_zero_content_length(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(400, raise_error = True, content='') - - try: - tap_github.authed_get("", "") - except tap_github.BadRequestException as e: - self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") - - def test_400_error(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(400, raise_error = True) - - try: - tap_github.authed_get("", "") - except tap_github.BadRequestException as e: - self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") - def test_401_error(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(401, raise_error = True) + """ + Test Error handling for `authed_get` method in client. + """ + + config = {"access_token": "", "repository": "org/test-repo, singer-io12/*"} + + def test_json_decoder_error(self, mocked_parse_args, mocked_request, mock_verify_access, mock_sleep): + """ + Verify handling of JSONDecoderError from the response. 
+ """ + + mock_response = get_mock_http_response(409, "json_error") + + with self.assertRaises(ConflictError) as e: + raise_for_error(mock_response, "", "", "", True) + + # Verifying the message formed for the custom exception + self.assertEqual(str(e.exception), "HTTP-error-code: 409, Error: The request could not be completed due to a conflict with the current state of the server.") + + @parameterized.expand([ + [400, "The request is missing or has a bad parameter.", BadRequestException, '', {}, 1], + [401, "Invalid authorization credentials.", BadCredentialsException, '', {}, 1], + [403, "User doesn't have permission to access the resource.", AuthException, '', {}, 1], + [500, "An error has occurred at Github's end.", InternalServerError, '', {}, 5], + [301, "The resource you are looking for is moved to another URL.", tap_github.client.MovedPermanentlyError, '', {}, 1], + [304, "The requested resource has not been modified since the last time you accessed it.", tap_github.client.NotModifiedError, '', {}, 1], + [409, "The request could not be completed due to a conflict with the current state of the server.", tap_github.client.ConflictError, '', {}, 1], + [422, "The request was not able to process right now.", tap_github.client.UnprocessableError, '', {}, 1], + [501, "Unknown Error", tap_github.client.Server5xxError, '', {}, 5], + [429, "Too many requests occurred.", tap_github.client.TooManyRequests, '', {}, 5], + ]) + def test_error_message_and_call_count(self, mocked_parse_args, mocked_request, mock_verify_access, mock_sleep, erro_code, error_msg, error_class, content, json_msg, call_count): + """ + - Verify that `authed_get` raises an error with the proper message for different error codes. + - Verify that tap retries 5 times for Server5xxError and RateLimitExceeded error. 
+ """ + mocked_request.return_value = get_response(erro_code, json = json_msg, raise_error = True, content = content) + test_client = GithubClient(self.config) + expected_error_message = "HTTP-error-code: {}, Error: {}".format(erro_code, error_msg) - try: - tap_github.authed_get("", "") - except tap_github.BadCredentialsException as e: - self.assertEqual(str(e), "HTTP-error-code: 401, Error: Invalid authorization credentials.") - - def test_403_error(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(403, raise_error = True) - - try: - tap_github.authed_get("", "") - except tap_github.AuthException as e: - self.assertEqual(str(e), "HTTP-error-code: 403, Error: User doesn't have permission to access the resource.") - - def test_404_error(self, mocked_parse_args, mocked_request): - json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} - mocked_request.return_value = get_response(404, json = json, raise_error = True) + with self.assertRaises(error_class) as e: + test_client.authed_get("", "") - try: - tap_github.authed_get("", "") - except tap_github.NotFoundException as e: - self.assertEqual(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Please refer '{}' for more details.".format(json.get("documentation_url"))) + # Verifying the message formed for the custom exception + self.assertEqual(str(e.exception), expected_error_message) - def test_404_error_for_teams(self, mocked_parse_args, mocked_request): - json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} - - try: - tap_github.raise_for_error(get_response(404, json = json, raise_error = True), "teams") - except tap_github.NotFoundException as e: - self.assertEqual(str(e), "HTTP-error-code: 404, Error: The resource you have specified cannot be found or it is a personal account repository. 
Please refer '{}' for more details.".format(json.get("documentation_url"))) - - def test_500_error(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(500, raise_error = True) - - try: - tap_github.authed_get("", "") - except tap_github.InternalServerError as e: - self.assertEqual(str(e), "HTTP-error-code: 500, Error: An error has occurred at Github's end.") - - def test_301_error(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(301, raise_error = True) + # Verify the call count for each error. + self.assertEquals(call_count, mocked_request.call_count) - try: - tap_github.authed_get("", "") - except tap_github.MovedPermanentlyError as e: - self.assertEqual(str(e), "HTTP-error-code: 301, Error: The resource you are looking for is moved to another URL.") - - def test_304_error(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(304, raise_error = True) - - try: - tap_github.authed_get("", "") - except tap_github.NotModifiedError as e: - self.assertEqual(str(e), "HTTP-error-code: 304, Error: The requested resource has not been modified since the last time you accessed it.") + @mock.patch("tap_github.client.LOGGER.warning") + def test_skip_404_error(self, mock_logger, mocked_parse_args, mocked_request, mock_verify_access, mock_sleep): + """ + Verify that `authed_get` skip 404 error and print the log message with the proper message. + """ + json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} + mocked_request.return_value = get_response(404, json = json, raise_error = True) + expected_message = "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Alternatively the access_token is not valid for the resource. 
Please refer '{}' for more details.".format(json.get("documentation_url")) + test_client = GithubClient(self.config) - def test_422_error(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(422, raise_error = True) + test_client.authed_get("", "") - try: - tap_github.authed_get("", "") - except tap_github.UnprocessableError as e: - self.assertEqual(str(e), "HTTP-error-code: 422, Error: The request was not able to process right now.") + # Verifying the message formed for the custom exception + self.assertEqual(mock_logger.mock_calls[0], mock.call(expected_message)) - def test_409_error(self, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(409, raise_error = True) + def test_raise_404_error_for_invalid_repo(self, mocked_parse_args, mocked_request, mock_verify_access, mock_sleep): + """ + Verify that `extract_repos_from_config` raises 404 error if invalid organization in given in the config. + """ + config = {'repository': 'singer-io12/*', "access_token": "TOKEN"} + test_client = GithubClient(config) + mocked_request.return_value = get_response(404, raise_error = True) - try: - tap_github.authed_get("", "") - except tap_github.ConflictError as e: - self.assertEqual(str(e), "HTTP-error-code: 409, Error: The request could not be completed due to a conflict with the current state of the server.") + with self.assertRaises(tap_github.client.NotFoundException) as e: + test_client.extract_repos_from_config() - def test_200_success(self, mocked_parse_args, mocked_request): - json = {"key": "value"} - mocked_request.return_value = get_response(200, json) + # Verifying the message formed for the custom exception + self.assertEqual(str(e.exception), "HTTP-error-code: 404, Error: Please check the organization name 'singer-io12' or you do not have sufficient permissions to access this organization.") - resp = tap_github.authed_get("", "") - self.assertEqual(json, resp.json()) diff --git 
a/tests/unittests/test_extract_repos_from_config.py b/tests/unittests/test_extract_repos_from_config.py index 4a205696..9d5a84f4 100644 --- a/tests/unittests/test_extract_repos_from_config.py +++ b/tests/unittests/test_extract_repos_from_config.py @@ -1,32 +1,66 @@ +from email.headerregistry import ParameterizedMIMEHeader import unittest -import tap_github +from unittest import mock +from tap_github.client import GithubClient, GithubException +from parameterized import parameterized -@unittest.mock.patch('tap_github.get_all_repos') +@mock.patch('tap_github.client.GithubClient.verify_access_for_repo') +@mock.patch('tap_github.client.GithubClient.get_all_repos') class TestExtractReposFromConfig(unittest.TestCase): + """ + Test `extract_repos_from_config` method from client. + """ - def test_single_repo(self, mocked_get_all_repos): - config = {'repository': 'singer-io/test-repo'} - expected_repositories = ['singer-io/test-repo'] - self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) - - def test_multiple_repos(self, mocked_get_all_repos): - config = {'repository': 'singer-io/test-repo singer-io/tap-github'} - expected_repositories = ['singer-io/test-repo', 'singer-io/tap-github'] - self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) - - def test_org_all_repos(self, mocked_get_all_repos): - config = {'repository': 'singer-io/test-repo test-org/*'} - expected_repositories = [ - 'singer-io/test-repo', - 'test-org/repo1', - 'test-org/repo2', - 'test-org/repo3' - ] - mocked_get_all_repos.return_value = [ - 'test-org/repo1', - 'test-org/repo2', - 'test-org/repo3' - ] - - self.assertEqual(expected_repositories, tap_github.extract_repos_from_config(config)) + @parameterized.expand([ + ['test_single_repo', 'singer-io/test-repo', [], ['singer-io/test-repo'], {'singer-io'}], + ['test_multiple_repos', 'singer-io/test-repo singer-io/tap-github', [], ['singer-io/tap-github', 'singer-io/test-repo'], {'singer-io'}], + 
['test_org_all_repos', 'singer-io/test-repo test-org/*', ['test-org/repo1', 'test-org/repo2'], ['singer-io/test-repo', 'test-org/repo1', 'test-org/repo2'], {'singer-io', 'test-org'}] + ]) + def test_extract_repos_from_config(self, mocked_get_all_repos, mock_verify_access, name, repo_paths, all_repos, expected_repos, expected_orgs): + """ + Test `extract_repos_from_config` if only one repo path is given in config. + """ + config = {'repository': repo_paths, "access_token": "TOKEN"} + test_client = GithubClient(config) + mocked_get_all_repos.return_value = all_repos + + actual_repos, actual_orgs = test_client.extract_repos_from_config() + # Verify list of repo path with expected + self.assertEqual((sorted(expected_repos), sorted(expected_orgs)), (sorted(actual_repos), sorted(actual_orgs))) + + @parameterized.expand([ + ['test_organization_without_repo_in_config', 'singer-io', ['singer-io']], + ['test_organization_without_repo_with_slash_in_config', 'singer-io/', ['singer-io/']], + ['test_organization_with_only_slash_in_config', '/', ['/']], + ['test_organization_with_multiple_wrong_formatted_repo_path_in_config', 'singer-io/ /tap-github', ["singer-io/", "/tap-github"]] + ]) + def test_organization_without_repo_in_config(self, mocked_get_all_repos, mock_verify_access, name, repo_paths, expected_repo): + """ + Verify that the tap throws an exception with a proper error message for invalid organization names. 
+ """ + config = {'repository': repo_paths, "access_token": "TOKEN"} + test_client = GithubClient(config) + expected_error_message = "Please provide valid organization/repository for: {}".format(sorted(expected_repo)) + with self.assertRaises(GithubException) as exc: + test_client.extract_repos_from_config() + + # Verify that we get expected error message + self.assertEqual(str(exc.exception), expected_error_message) + + @mock.patch('tap_github.client.LOGGER.warning') + def test_organization_with_duplicate_repo_paths_in_config(self, mock_warn, mocked_get_all_repos, mock_verify_access): + """ + Verify that the tap logs proper warning message for duplicate repos in config and returns list without duplicates + """ + config = {'repository': 'singer-io/tap-github singer-io/tap-github singer-io/test-repo', "access_token": "TOKEN"} + test_client = GithubClient(config) + expected_repos = ['singer-io/tap-github', 'singer-io/test-repo'] + actual_repos, orgs = test_client.extract_repos_from_config() + expected_message = "Duplicate repositories found: %s and will be synced only once." 
+ + # Verify that the logger is called with expected error message + mock_warn.assert_called_with(expected_message, ['singer-io/tap-github']) + + # Verify that extract_repos_from_config() returns repos without duplicates + self.assertEqual(sorted(expected_repos), sorted(actual_repos)) \ No newline at end of file diff --git a/tests/unittests/test_formatting_dates.py b/tests/unittests/test_formatting_dates.py deleted file mode 100644 index 72a70925..00000000 --- a/tests/unittests/test_formatting_dates.py +++ /dev/null @@ -1,120 +0,0 @@ -import unittest -from unittest import mock -import singer -import tap_github.__init__ as tap_github - -class Mockresponse: - def __init__(self, resp, not_list=False): - self.not_list = not_list - self.json_data = resp - self.content = "github" - - def json(self): - if self.not_list: - return self.json_data - return [self.json_data] - -def get_response(json, not_list=False): - if not_list: - yield Mockresponse(json, not_list) - else: - yield Mockresponse(resp=json) - -@mock.patch("tap_github.__init__.authed_get_all_pages") -class TestRateLimit(unittest.TestCase): - - def test_due_on_none_without_state(self, mocked_request): - """ - "due_on" is "None", - so we will get 1 records - """ - json = {"due_on": None} - - mocked_request.return_value = get_response(json) - - init_state = {} - repo_path = "singer-io/tap-github" - - final_state = tap_github.get_all_issue_milestones({}, repo_path, init_state, {}, "") - # as we will get 1 record and initial bookmark is empty, checking that if bookmark exists in state file returned - self.assertTrue(final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) - - def test_due_on_none_with_state(self, mocked_request): - """ - "due_on" is "None", - so we will get 1 records - """ - json = {"due_on": None} - - mocked_request.return_value = get_response(json) - - repo_path = "singer-io/tap-github" - init_state = {'bookmarks': {'singer-io/tap-github': {'issue_milestones': {'since': 
'2021-05-05T07:20:36.887412Z'}}}} - init_bookmark = singer.utils.strptime_to_utc(init_state["bookmarks"][repo_path]["issue_milestones"]["since"]) - - final_state = tap_github.get_all_issue_milestones({}, repo_path, init_state, {}, "") - last_bookmark = singer.utils.strptime_to_utc(final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) - # as we will get 1 record, final bookmark will be greater than initial bookmark - self.assertGreater(last_bookmark, init_bookmark) - - def test_due_on_not_none_1(self, mocked_request): - """ - Bookmark value is smaller than "due_on", - so we will get 1 records - """ - json = {"due_on": "2021-05-07T07:00:00Z"} - - mocked_request.return_value = get_response(json) - mocked_request.singer.write_record.side_effect = None - - repo_path = "singer-io/tap-github" - init_state = {'bookmarks': {'singer-io/tap-github': {'issue_milestones': {'since': '2021-05-05T07:20:36.887412Z'}}}} - init_bookmark = singer.utils.strptime_to_utc(init_state["bookmarks"][repo_path]["issue_milestones"]["since"]) - - final_state = tap_github.get_all_issue_milestones({}, repo_path, init_state, {}, "") - last_bookmark = singer.utils.strptime_to_utc(final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) - # as we will get 1 record, final bookmark will be greater than initial bookmark - self.assertGreater(last_bookmark, init_bookmark) - - def test_due_on_not_none_2(self, mocked_request): - """ - Bookmark value is greater than "due_on", - so we will get 0 records - """ - json = {"due_on": "2021-05-07T07:00:00Z"} - - mocked_request.return_value = get_response(json) - - repo_path = "singer-io/tap-github" - init_state = {'bookmarks': {'singer-io/tap-github': {'issue_milestones': {'since': '2021-05-08T07:20:36.887412Z'}}}} - init_bookmark = init_state["bookmarks"][repo_path]["issue_milestones"]["since"] - - final_state = tap_github.get_all_issue_milestones({}, repo_path, init_state, {}, "") - # as we will get 0 records, initial and final bookmark will 
be same - self.assertEqual(init_bookmark, final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) - - @mock.patch("singer.write_record") - def test_data_containing_both_values(self, mocked_write_record, mocked_request): - """ - As we have 3 records here, - -> due_on = None - -> due_on > Bookmark - -> due_on < Bookmark - so, here we will get 2 records, - -> due_on = None - -> due_on > Bookmark - """ - json = [{"due_on": "2021-05-07T07:00:00Z"}, {"due_on": "2021-05-09T07:00:00Z"}, {"due_on": None}] - - mocked_request.return_value = get_response(json, True) - - repo_path = "singer-io/tap-github" - init_state = {'bookmarks': {'singer-io/tap-github': {'issue_milestones': {'since': '2021-05-08T07:20:36.887412Z'}}}} - init_bookmark = singer.utils.strptime_to_utc(init_state["bookmarks"][repo_path]["issue_milestones"]["since"]) - - final_state = tap_github.get_all_issue_milestones({}, repo_path, init_state, {}, "") - last_bookmark = singer.utils.strptime_to_utc(final_state["bookmarks"][repo_path]["issue_milestones"]["since"]) - # as we will get 2 record, final bookmark will be greater than initial bookmark - self.assertGreater(last_bookmark, init_bookmark) - # as we will get 2 record, write_records will also be called 2 times - self.assertEqual(mocked_write_record.call_count, 2) diff --git a/tests/unittests/test_get_all_repos.py b/tests/unittests/test_get_all_repos.py index c8ca7a0b..9235acad 100644 --- a/tests/unittests/test_get_all_repos.py +++ b/tests/unittests/test_get_all_repos.py @@ -1,9 +1,10 @@ import unittest +from unittest import mock import requests import requests_mock import simplejson as json -import tap_github +from tap_github.client import GithubClient from itertools import cycle @@ -12,12 +13,23 @@ ADAPTER = requests_mock.Adapter() SESSION.mount('mock://', ADAPTER) +class MockResponse(): + """ Mock response object class.""" -@unittest.mock.patch('tap_github.verify_repo_access') -@unittest.mock.patch('tap_github.authed_get_all_pages') + def 
__init__(self, links): + self.links = links + +@mock.patch('tap_github.client.GithubClient.verify_repo_access') +@mock.patch('tap_github.client.GithubClient.authed_get_all_pages') class TestGetAllRepos(unittest.TestCase): + """ + Test `get_all_repos` method from client. + """ + config = {"access_token": "", "repository": "test-org/repo1 test-org/repo2 test-org/repo3"} def test_single_organization(self, mocked_authed_get_all_pages, mocked_verify_repo_access): + """Verify for single organisation with all repos.""" + test_client = GithubClient(self.config) orgs = ['test-org/*'] repos = ['repo1', 'repo2', 'repo3'] @@ -39,9 +51,12 @@ def test_single_organization(self, mocked_authed_get_all_pages, mocked_verify_re ] mocked_authed_get_all_pages.return_value = [mocked_response] - self.assertEqual(expected_repositories, tap_github.get_all_repos(orgs)) + # Verify expected list of repo paths + self.assertEqual(expected_repositories, test_client.get_all_repos(orgs)) def test_multiple_organizations(self, mocked_authed_get_all_pages, mocked_verify_repo_access): + """Verify for multiple organisations with all repos.""" + test_client = GithubClient(self.config) orgs = ['test-org/*', 'singer-io/*'] repos = ['repo1', 'repo2', 'repo3'] @@ -58,7 +73,7 @@ def test_multiple_organizations(self, mocked_authed_get_all_pages, mocked_verify mocked_response = SESSION.get(mocked_url) mocked_authed_get_all_pages.return_value = [mocked_response] - call_response = tap_github.get_all_repos([org]) + call_response = test_client.get_all_repos([org]) side_effect.extend(call_response) @@ -71,4 +86,42 @@ def test_multiple_organizations(self, mocked_authed_get_all_pages, mocked_verify 'singer-io/repo3' ] + # Verify expected list of repo paths self.assertListEqual(expected_repositories, side_effect) + +@mock.patch('tap_github.client.GithubClient.verify_repo_access') +@mock.patch('tap_github.client.GithubClient.authed_get') +class TestAuthedGetAllPages(unittest.TestCase): + """ + Test 
`authed_get_all_pages` method from client. + """ + config = {"access_token": "", "repository": "test-org/repo1"} + + def test_for_one_page(self, mock_auth_get, mock_verify_access): + + """Verify `authed_get` is called only once if one page is available.""" + + test_client = GithubClient(self.config) + mock_auth_get.return_value = MockResponse({}) + + list(test_client.authed_get_all_pages("", "mock_url", {})) + + # Verify `auth_get` call count + self.assertEqual(mock_auth_get.call_count, 1) + + def test_for_multiple_pages(self, mock_auth_get, mock_verify_access): + + """Verify `authed_get` is called equal number times as pages available.""" + + test_client = GithubClient(self.config) + mock_auth_get.side_effect = [MockResponse({"next": {"url": "mock_url_2"}}),MockResponse({"next": {"url": "mock_url_3"}}),MockResponse({})] + + list(test_client.authed_get_all_pages("", "mock_url_1", {})) + + # Verify `auth_get` call count + self.assertEqual(mock_auth_get.call_count, 3) + + # Verify `auth_get` calls with expected url + self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "mock_url_1", {}, '', True)) + self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "mock_url_2", {}, '', True)) + self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "mock_url_3", {}, '', True)) diff --git a/tests/unittests/test_get_streams_and_state_translate.py b/tests/unittests/test_get_streams_and_state_translate.py new file mode 100644 index 00000000..c862f7b3 --- /dev/null +++ b/tests/unittests/test_get_streams_and_state_translate.py @@ -0,0 +1,135 @@ +import unittest +from tap_github.sync import get_selected_streams, translate_state, get_stream_to_sync +from parameterized import parameterized + +def get_stream_catalog(stream_name, selected_in_metadata = False): + """Return catalog for stream""" + return { + "schema":{}, + "tap_stream_id": stream_name, + "key_properties": [], + "metadata": [ + { + "breadcrumb": [], + "metadata":{"selected": selected_in_metadata} + } + ] + } 
+ +class TestTranslateState(unittest.TestCase): + """ + Testcase for `translate_state` in sync + """ + + catalog = { + "streams": [ + get_stream_catalog("comments"), + get_stream_catalog("releases"), + get_stream_catalog("issue_labels"), + get_stream_catalog("issue_events") + ] + } + + def test_newer_format_state_with_repo_name(self): + """Verify that `translate_state` return the state itself if a newer format bookmark is found.""" + state = { + "bookmarks": { + "org/test-repo" : { + "comments": {"since": "2019-01-01T00:00:00Z"} + }, + "org/test-repo2" : {} + } + } + + final_state = translate_state(state, self.catalog, ["org/test-repo", "org/test-repo2"]) + self.assertEqual(state, dict(final_state)) + + def test_older_format_state_without_repo_name(self): + """Verify that `translate_state` migrate each stream's bookmark into the repo name""" + older_format_state = { + "bookmarks": { + "comments": {"since": "2019-01-01T00:00:00Z"} + } + } + expected_state = { + "bookmarks": { + "org/test-repo" : { + "comments": {"since": "2019-01-01T00:00:00Z"} + }, + "org/test-repo2" : { + "comments": {"since": "2019-01-01T00:00:00Z"} + } + } + } + final_state = translate_state(older_format_state, self.catalog, ["org/test-repo", "org/test-repo2"]) + self.assertEqual(expected_state, dict(final_state)) + + def test_with_empty_state(self): + """Verify for empty state""" + + final_state = translate_state({}, self.catalog, ["org/test-repo"]) + + self.assertEqual({}, dict(final_state)) + + def test_state_with_no_previous_repo_name_newer_format_bookmark(self): + """Verify that `translate_state` return the existing state if all existing repo unselected in the current sync.""" + newer_format_state = { + "bookmarks": { + "org/test-repo" : { + "comments": {"since": "2019-01-01T00:00:00Z"} + }, + "org/test-repo2" : {} + } + } + final_state = translate_state(newer_format_state, self.catalog, ["org/test-repo3", "org/test-repo4"]) + self.assertEqual(newer_format_state, dict(final_state)) + + def 
test_state_with_no_previous_repo_name_old_format_bookmark(self): + """Verify that `translate_state` migrate each stream's bookmark into the repo name""" + older_format_state = { + "bookmarks": { + "comments": {"since": "2019-01-01T00:00:00Z"} + } + } + expected_state = { + "bookmarks": { + "org/test-repo3" : { + "comments": {"since": "2019-01-01T00:00:00Z"} + }, + "org/test-repo4" : { + "comments": {"since": "2019-01-01T00:00:00Z"} + } + } + } + final_state = translate_state(older_format_state, self.catalog, ["org/test-repo3", "org/test-repo4"]) + self.assertEqual(expected_state, dict(final_state)) + +class TestGetStreamsToSync(unittest.TestCase): + """ + Testcase for `get_stream_to_sync` in sync + """ + + def get_catalog(self, parent=False, mid_child = False, child = False): + return { + "streams": [ + get_stream_catalog("projects", selected_in_metadata=parent), + get_stream_catalog("project_columns", selected_in_metadata=mid_child), + get_stream_catalog("project_cards", selected_in_metadata=child), + get_stream_catalog("teams", selected_in_metadata=parent), + get_stream_catalog("team_members", selected_in_metadata=mid_child), + get_stream_catalog("team_memberships", selected_in_metadata=child), + get_stream_catalog("assignees", selected_in_metadata=parent), + ] + } + + @parameterized.expand([ + ['test_parent_selected', ["assignees", "projects", "teams"], True, False, False], + ['test_mid_child_selected', ["projects", "project_columns", "teams", "team_members"], False, True, False], + ['test_lowest_child_selected', ["projects", "project_columns", "project_cards", "teams", "team_members", "team_memberships"], False, False, True] + ]) + def test_stream_selection(self, name, expected_streams, is_parent, is_mid_child, is_child): + """Test that if an only child or mid-child is selected in the catalog, then `get_stream_to_sync` returns the parent stream also""" + catalog = self.get_catalog(parent=is_parent, mid_child=is_mid_child, child=is_child) + sync_streams = 
get_stream_to_sync(catalog) + + self.assertEqual(sync_streams, expected_streams) diff --git a/tests/unittests/test_key_error.py b/tests/unittests/test_key_error.py deleted file mode 100644 index 7e5bb28c..00000000 --- a/tests/unittests/test_key_error.py +++ /dev/null @@ -1,160 +0,0 @@ -import unittest -from unittest import mock -import tap_github.__init__ as tap_github - -class Mockresponse: - def __init__(self, resp): - self.json_data = resp - self.content = "github" - - def json(self): - return [(self.json_data)] - -def get_response(json): - yield Mockresponse(resp=json) - -@mock.patch("tap_github.__init__.authed_get_all_pages") -class TestKeyErrorSlug(unittest.TestCase): - - @mock.patch("tap_github.__init__.get_all_team_members") - def test_slug_sub_stream_selected_slug_selected(self, mocked_team_members, mocked_request): - json = {"key": "value", "slug": "team-slug"} - - mocked_request.return_value = get_response(json) - - schemas = {"teams": "None", "team_members": "None"} - mdata_slug = [ - { - 'breadcrumb': [], - 'metadata': {'selected': True, 'table-key-properties': ['id']} - }, - { - 'breadcrumb': ['properties', 'slug'], - 'metadata': {'inclusion': 'available'} - }, - { - "breadcrumb": [ "properties", "name"], - "metadata": {"inclusion": "available"} - }] - mdata = {"teams": mdata_slug, "team_members": mdata_slug} - tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEqual(mocked_team_members.call_count, 1) - - @mock.patch("tap_github.__init__.get_all_team_members") - def test_slug_sub_stream_not_selected_slug_selected(self, mocked_team_members, mocked_request): - json = {"key": "value", "slug": "team-slug"} - - mocked_request.return_value = get_response(json) - - schemas = {"teams": "None"} - mdata = {"teams": [ - { - 'breadcrumb': [], - 'metadata': {'selected': True, 'table-key-properties': ['id']} - }, - { - 'breadcrumb': ['properties', 'slug'], - 'metadata': {'inclusion': 'available'} - }, - { - "breadcrumb": [ "properties", 
"name"], - "metadata": {"inclusion": "available"} - }]} - tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEqual(mocked_team_members.call_count, 0) - - @mock.patch("tap_github.__init__.get_all_team_members") - def test_slug_sub_stream_selected_slug_not_selected(self, mocked_team_members, mocked_request): - json = {"key": "value", "slug": "team-slug"} - - mocked_request.return_value = get_response(json) - - schemas = {"teams": "None", "team_members": "None"} - mdata_slug = [ - { - 'breadcrumb': [], - 'metadata': {'selected': True, 'table-key-properties': ['id']} - }, - { - 'breadcrumb': ['properties', 'slug'], - 'metadata': {'inclusion': 'available', 'selected': False} - }, - { - "breadcrumb": [ "properties", "name"], - "metadata": {"inclusion": "available"} - }] - mdata = {"teams": mdata_slug, "team_members": mdata_slug} - tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEqual(mocked_team_members.call_count, 1) - - @mock.patch("tap_github.__init__.get_all_team_members") - def test_slug_sub_stream_not_selected_slug_not_selected(self, mocked_team_members, mocked_request): - json = {"key": "value", "slug": "team-slug"} - - mocked_request.return_value = get_response(json) - - schemas = {"teams": "None"} - mdata = {"teams": [ - { - 'breadcrumb': [], - 'metadata': {'selected': True, 'table-key-properties': ['id']} - }, - { - 'breadcrumb': ['properties', 'slug'], - 'metadata': {'inclusion': 'available', 'selected': False} - }, - { - "breadcrumb": [ "properties", "name"], - "metadata": {"inclusion": "available"} - }]} - tap_github.get_all_teams(schemas, "tap-github", {}, mdata, "") - self.assertEqual(mocked_team_members.call_count, 0) - -@mock.patch("tap_github.__init__.authed_get_all_pages") -class TestKeyErrorUser(unittest.TestCase): - - @mock.patch("singer.write_record") - def test_user_not_selected_in_stargazers(self, mocked_write_records, mocked_request): - json = {"key": "value", "user": {"id": 1}} - - 
mocked_request.return_value = get_response(json) - - schemas = {"teams": "None"} - mdata = [ - { - 'breadcrumb': [], - 'metadata': {'selected': True, 'table-key-properties': ['user_id']} - }, - { - "breadcrumb": ["properties", "user"], - "metadata": {"inclusion": "available", "selected": False} - }, - { - "breadcrumb": ["properties", "starred_at"], - "metadata": {"inclusion": "available"} - }] - tap_github.get_all_stargazers(schemas, "tap-github", {}, mdata, "") - self.assertEqual(mocked_write_records.call_count, 1) - - @mock.patch("singer.write_record") - def test_user_selected_in_stargazers(self, mocked_write_records, mocked_request): - json = {"key": "value", "user": {"id": 1}} - - mocked_request.return_value = get_response(json) - - schemas = {"stargazers": "None"} - mdata = [ - { - 'breadcrumb': [], - 'metadata': {'selected': True, 'table-key-properties': ['user_id']} - }, - { - "breadcrumb": ["properties", "user"], - "metadata": {"inclusion": "available"} - }, - { - "breadcrumb": ["properties", "starred_at"], - "metadata": {"inclusion": "available"} - }] - tap_github.get_all_stargazers(schemas, "tap-github", {}, mdata, "") - self.assertEqual(mocked_write_records.call_count, 1) diff --git a/tests/unittests/test_main.py b/tests/unittests/test_main.py new file mode 100644 index 00000000..44d5d22c --- /dev/null +++ b/tests/unittests/test_main.py @@ -0,0 +1,103 @@ +import unittest +from unittest import mock +from tap_github import main +from tap_github.discover import discover + +class MockArgs: + """Mock args object class""" + + def __init__(self, config = None, properties = None, state = None, discover = False) -> None: + self.config = config + self.properties = properties + self.state = state + self.discover = discover + +@mock.patch("tap_github.GithubClient") +@mock.patch("singer.utils.parse_args") +class TestDiscoverMode(unittest.TestCase): + """ + Test main function for discover mode + """ + + mock_config = {"start_date": "", "access_token": ""} + + 
@mock.patch("tap_github._discover") + def test_discover_with_config(self, mock_discover, mock_args, mock_verify_access): + """Test `_discover` function is called for discover mode""" + mock_discover.return_value = dict() + mock_args.return_value = MockArgs(discover = True, config = self.mock_config) + main() + + self.assertTrue(mock_discover.called) + + +@mock.patch("tap_github.GithubClient") +@mock.patch("singer.utils.parse_args") +@mock.patch("tap_github._sync") +class TestSyncMode(unittest.TestCase): + """ + Test main function for sync mode + """ + + mock_config = {"start_date": "", "access_token": ""} + mock_catalog = {"streams": [{"stream": "teams", "schema": {}, "metadata": {}}]} + + @mock.patch("tap_github._discover") + def test_sync_with_properties(self, mock_discover, mock_sync, mock_args, mock_client): + """Test sync mode with properties given in args""" + + mock_client.return_value = "mock_client" + mock_args.return_value = MockArgs(config=self.mock_config, properties=self.mock_catalog) + main() + + # Verify `_sync` is called with expected arguments + mock_sync.assert_called_with("mock_client", self.mock_config, {}, self.mock_catalog) + + # verify `_discover` function is not called + self.assertFalse(mock_discover.called) + + @mock.patch("tap_github._discover") + def test_sync_without_properties(self, mock_discover, mock_sync, mock_args, mock_client): + """Test sync mode without properties given in args""" + + mock_discover.return_value = {"schema": "", "metadata": ""} + mock_client.return_value = "mock_client" + mock_args.return_value = MockArgs(config=self.mock_config) + main() + + # Verify `_sync` is called with expected arguments + mock_sync.assert_called_with("mock_client", self.mock_config, {}, {"schema": "", "metadata": ""}) + + # verify `_discover` function is called + self.assertTrue(mock_discover.called) + + def test_sync_with_state(self, mock_sync, mock_args, mock_client): + """Test sync mode with state given in args""" + mock_state = 
{"bookmarks": {"projec ts": ""}} + mock_client.return_value = "mock_client" + mock_args.return_value = MockArgs(config=self.mock_config, properties=self.mock_catalog, state=mock_state) + main() + + # Verify `_sync` is called with expected arguments + mock_sync.assert_called_with("mock_client", self.mock_config, mock_state, self.mock_catalog) + +@mock.patch("tap_github.GithubClient") +class TestDiscover(unittest.TestCase): + """Test `discover` function.""" + + def test_discover(self, mock_client): + + return_catalog = discover(mock_client) + + self.assertIsInstance(return_catalog, dict) + + @mock.patch("tap_github.discover.Schema") + @mock.patch("tap_github.discover.LOGGER.error") + def test_discover_error_handling(self, mock_logger, mock_schema, mock_client): + """Test discover function if exception arises.""" + mock_schema.from_dict.side_effect = [Exception] + with self.assertRaises(Exception): + discover(mock_client) + + # Verify logger called 3 times when an exception arises. + self.assertEqual(mock_logger.call_count, 3) diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 7fb01873..987c60a0 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -1,17 +1,26 @@ -import tap_github.__init__ as tap_github +import tap_github +from tap_github.client import rate_throttling, GithubException import unittest from unittest import mock import time import requests +DEFAULT_SLEEP_SECONDS = 600 def api_call(): return requests.get("https://api.github.com/rate_limit") @mock.patch('time.sleep') class TestRateLimit(unittest.TestCase): + """ + Test `rate_throttling` function from client. 
+ """ + config = {"access_token": "", "repository": "singer-io/tap-github"} def test_rate_limt_wait(self, mocked_sleep): + """ + Test `rate_throttling` for 'sleep_time' less than `MAX_SLEEP_SECONDS` + """ mocked_sleep.side_effect = None @@ -19,13 +28,17 @@ def test_rate_limt_wait(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 120 resp.headers["X-RateLimit-Remaining"] = 0 - tap_github.rate_throttling(resp) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + # Verify `time.sleep` is called with expected seconds in response mocked_sleep.assert_called_with(120) self.assertTrue(mocked_sleep.called) def test_rate_limit_exception(self, mocked_sleep): + """ + Test `rate_throttling` for 'sleep_time' greater than `MAX_SLEEP_SECONDS` + """ mocked_sleep.side_effect = None @@ -33,13 +46,16 @@ def test_rate_limit_exception(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 601 resp.headers["X-RateLimit-Remaining"] = 0 - try: - tap_github.rate_throttling(resp) - except tap_github.RateLimitExceeded as e: - self.assertEqual(str(e), "API rate limit exceeded, please try after 601 seconds.") + # Verify exception is raised with proper message + with self.assertRaises(tap_github.client.RateLimitExceeded) as e: + rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 601 seconds.") def test_rate_limit_not_exceeded(self, mocked_sleep): + """ + Test `rate_throttling` if sleep time does not exceed limit + """ mocked_sleep.side_effect = None @@ -47,6 +63,20 @@ def test_rate_limit_not_exceeded(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 10 resp.headers["X-RateLimit-Remaining"] = 5 - tap_github.rate_throttling(resp) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + # Verify that `time.sleep` is not called self.assertFalse(mocked_sleep.called) + + def test_rate_limt_header_not_found(self, mocked_sleep): + """ + Test 
that the `rate_throttling` function raises an exception if `X-RateLimit-Reset` key is not found in the header. + """ + resp = api_call() + resp.headers={} + + with self.assertRaises(GithubException) as e: + rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + + # Verifying the message formed for the invalid base URL + self.assertEqual(str(e.exception), "The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") diff --git a/tests/unittests/test_stargazers_full_table.py b/tests/unittests/test_stargazers_full_table.py deleted file mode 100644 index 47cb7089..00000000 --- a/tests/unittests/test_stargazers_full_table.py +++ /dev/null @@ -1,14 +0,0 @@ -import unittest -from unittest import mock -import tap_github.__init__ as tap_github - -@mock.patch("tap_github.__init__.authed_get_all_pages") -class TestStargazersFullTable(unittest.TestCase): - - def test_stargazers_without_query_params(self, mocked_request): - - schemas = {"stargazers": "None"} - - tap_github.get_all_stargazers(schemas, "tap-github", {}, {}, "") - - mocked_request.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY) diff --git a/tests/unittests/test_start_date_bookmark.py b/tests/unittests/test_start_date_bookmark.py deleted file mode 100644 index 8cfb4b18..00000000 --- a/tests/unittests/test_start_date_bookmark.py +++ /dev/null @@ -1,42 +0,0 @@ -import tap_github -import unittest -from unittest import mock - -@mock.patch("singer.bookmarks.get_bookmark") -class TestBookmarkStartDate(unittest.TestCase): - - def test_no_bookmark_no_start_date(self, mocked_get_bookmark): - # Start date is none and bookmark is not present then None should be return. 
- mocked_get_bookmark.return_value = None - start_date = None - bookmark_key = 'since' - expected_bookmark_value = None - - self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) - - def test_no_bookmark_yes_start_date(self, mocked_get_bookmark): - # Start date is present and bookmark is not present then start date should be return. - mocked_get_bookmark.return_value = None - start_date = '2021-04-01T00:00:00.000000Z' - bookmark_key = 'since' - expected_bookmark_value = '2021-04-01T00:00:00.000000Z' - - self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) - - def test_yes_bookmark_yes_start_date(self, mocked_get_bookmark): - # Start date and bookmark both are present then bookmark should be return. - mocked_get_bookmark.return_value = {"since" : "2021-05-01T00:00:00.000000Z"} - start_date = '2021-04-01T00:00:00.000000Z' - bookmark_key = 'since' - expected_bookmark_value = '2021-05-01T00:00:00.000000Z' - - self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) - - def test_yes_bookmark_no_start_date(self, mocked_get_bookmark): - # Start date is not present and bookmark is present then bookmark should be return. 
- mocked_get_bookmark.return_value = {"since" : "2021-05-01T00:00:00.000000Z"} - start_date = None - bookmark_key = 'since' - expected_bookmark_value = '2021-05-01T00:00:00.000000Z' - - self.assertEqual(expected_bookmark_value, tap_github.get_bookmark('', '', '', bookmark_key, start_date)) diff --git a/tests/unittests/test_stream.py b/tests/unittests/test_stream.py new file mode 100644 index 00000000..27cf49fa --- /dev/null +++ b/tests/unittests/test_stream.py @@ -0,0 +1,189 @@ +import unittest +from unittest import mock +from tap_github.streams import Comments, ProjectColumns, Projects, Reviews, TeamMemberships, Teams, PullRequests, get_schema, get_child_full_url, get_bookmark +from parameterized import parameterized + + +class TestGetSchema(unittest.TestCase): + """ + Test `get_schema` method of the stream class + """ + + def test_get_schema(self): + """Verify function returns expected schema""" + catalog = [ + {"tap_stream_id": "projects"}, + {"tap_stream_id": "comments"}, + {"tap_stream_id": "events"}, + ] + expected_schema = {"tap_stream_id": "comments"} + + # Verify returned schema is same as expected schema + self.assertEqual(get_schema(catalog, "comments"), expected_schema) + + +class TestGetBookmark(unittest.TestCase): + """ + Test `get_bookmark` method + """ + + test_stream = Comments() + + def test_with_out_repo_path(self): + """ + Test if the state does not contain a repo path + """ + state = { + "bookmarks": { + "projects": {"since": "2022-01-01T00:00:00Z"} + } + } + returned_bookmark = get_bookmark(state, "org/test-repo", "projects", "since", "2021-01-01T00:00:00Z") + self.assertEqual(returned_bookmark, "2021-01-01T00:00:00Z") + + def test_with_repo_path(self): + """ + Test if the state does contain a repo path + """ + state = { + "bookmarks": { + "org/test-repo": { + "projects": {"since": "2022-01-01T00:00:00Z"} + } + } + } + returned_bookmark = get_bookmark(state, "org/test-repo", "projects", "since", "2021-01-01T00:00:00Z") + 
self.assertEqual(returned_bookmark, "2022-01-01T00:00:00Z") + +class TestBuildUrl(unittest.TestCase): + """ + Test the `build_url` method of the stream class + """ + + @parameterized.expand([ + ["test_stream_with_filter_params", "org/test-repo", "https://api.github.com/repos/org/test-repo/issues/comments?sort=updated&direction=desc?since=2022-01-01T00:00:00Z", Comments], + ["test_stream_with_organization", "org", "https://api.github.com/orgs/org/teams", Teams] + ]) + def test_build_url(self, name, param, expected_url, stream_class): + """ + Test the `build_url` method for filter param or organization name only. + """ + test_streams = stream_class() + full_url = test_streams.build_url("https://api.github.com", param, "2022-01-01T00:00:00Z") + + # verify returned url is expected + self.assertEqual(expected_url, full_url) + + +class GetMinBookmark(unittest.TestCase): + """ + Test `get_min_bookmark` method of the stream class + """ + + start_date = "2020-04-01T00:00:00Z" + state = { + "bookmarks": { + "org/test-repo": { + "projects": {"since": "2022-03-29T00:00:00Z"}, + "project_columns": {"since": "2022-03-01T00:00:00Z"}, + "project_cards": {"since": "2022-03-14T00:00:00Z"}, + "pull_requests": {"since": "2022-04-01T00:00:00Z"}, + "review_comments": {"since": "2022-03-01T00:00:00Z"}, + "pr_commits": {"since": "2022-02-01T00:00:00Z"}, + "reviews": {"since": "2022-05-01T00:00:00Z"} + } + } + } + + @parameterized.expand([ + ["test_multiple_children", PullRequests, "pull_requests", ["pull_requests","review_comments", "pr_commits"], "2022-04-01T00:00:00Z", "2022-02-01T00:00:00Z"], + ["test_children_with_only_parent_selected", PullRequests, "pull_requests", ["pull_requests"], "2022-04-01T00:00:00Z", "2022-04-01T00:00:00Z"], + ["test_for_mid_child_in_stream", Projects, "projects", ["projects", "project_columns"], "2022-03-29T00:00:00Z", "2022-03-01T00:00:00Z"], + ["test_nested_child_bookmark", Projects, "projects", ["projects", "project_cards"], "2022-03-29T00:00:00Z", 
"2022-03-14T00:00:00Z"] + ]) + def test_multiple_children(self, name, stream_class, stream_name, stream_to_sync, current_date, expected_bookmark): + """ + Test that `get_min_bookmark` method returns the minimum bookmark from the parent and its corresponding child bookmarks. + """ + test_stream = stream_class() + bookmark = test_stream.get_min_bookmark(stream_name, stream_to_sync, + current_date, "org/test-repo", self.start_date, self.state) + + # Verify returned bookmark is expected + self.assertEqual(bookmark, expected_bookmark) + + +@mock.patch("singer.write_bookmark") +class TestWriteBookmark(unittest.TestCase): + """ + Test the `write_bookmarks` method of the stream class + """ + + state = { + "bookmarks": { + "org/test-repo": { + "projects": {"since": "2021-03-29T00:00:00Z"}, + "project_columns": {"since": "2021-03-01T00:00:00Z"}, + "project_cards": {"since": "2021-03-14T00:00:00Z"}, + "pull_requests": {"since": "2021-04-01T00:00:00Z"}, + "review_comments": {"since": "2021-03-01T00:00:00Z"}, + "pr_commits": {"since": "2021-02-01T00:00:00Z"}, + "reviews": {"since": "2021-05-01T00:00:00Z"} + } + } + } + + def test_multiple_child(self, mock_write_bookmark): + """ + Test for a stream with multiple children is selected + """ + test_stream = PullRequests() + test_stream.write_bookmarks("pull_requests", ["pull_requests","review_comments", "pr_commits"], + "2022-04-01T00:00:00Z", "org/test-repo", self.state) + + expected_calls = [ + mock.call(mock.ANY, mock.ANY, "pull_requests", {"since": "2022-04-01T00:00:00Z"}), + mock.call(mock.ANY, mock.ANY, "pr_commits", {"since": "2022-04-01T00:00:00Z"}), + mock.call(mock.ANY, mock.ANY, "review_comments", {"since": "2022-04-01T00:00:00Z"}), + ] + + # Verify `write_bookmark` is called for all selected streams + self.assertEqual(mock_write_bookmark.call_count, 3) + + self.assertIn(mock_write_bookmark.mock_calls[0], expected_calls) + self.assertIn(mock_write_bookmark.mock_calls[1], expected_calls) + 
self.assertIn(mock_write_bookmark.mock_calls[2], expected_calls) + + def test_nested_child(self, mock_write_bookmark): + """ + Test for the stream if the nested child is selected + """ + test_stream = Projects() + test_stream.write_bookmarks("projects", ["project_cards"], + "2022-04-01T00:00:00Z", "org/test-repo", self.state) + + # Verify `write_bookmark` is called for all selected streams + self.assertEqual(mock_write_bookmark.call_count, 1) + mock_write_bookmark.assert_called_with(mock.ANY, mock.ANY, + "project_cards", {"since": "2022-04-01T00:00:00Z"}) + + +class TestGetChildUrl(unittest.TestCase): + """ + Test `get_child_full_url` method of stream class + """ + domain = 'https://api.github.com' + + @parameterized.expand([ + ["test_child_stream", ProjectColumns, "org1/test-repo", "https://api.github.com/projects/1309875/columns", None, (1309875,)], + ["test_child_is_repository", Reviews, "org1/test-repo", "https://api.github.com/repos/org1/test-repo/pulls/11/reviews", (11,), None], + ["test_child_is_organization", TeamMemberships, "org1", "https://api.github.com/orgs/org1/teams/dev-team/memberships/demo-user-1", ("dev-team",), ("demo-user-1",)] + ]) + + def test_child_stream(self, name, stream_class, param, expected_url, parent_id, grand_parent_id): + """ + Test for a stream with one child + """ + child_stream = stream_class() + full_url = get_child_full_url(self.domain, child_stream, param, parent_id, grand_parent_id) + self.assertEqual(expected_url, full_url) diff --git a/tests/unittests/test_sub_streams_selection.py b/tests/unittests/test_sub_streams_selection.py deleted file mode 100644 index 8dd16ff9..00000000 --- a/tests/unittests/test_sub_streams_selection.py +++ /dev/null @@ -1,48 +0,0 @@ -import unittest -import tap_github.__init__ as tap_github - -class TestSubStreamSelection(unittest.TestCase): - - def test_pull_request_sub_streams_selected(self): - selected_streams = ["reviews", "pull_requests"] - 
self.assertIsNone(tap_github.validate_dependencies(selected_streams)) - - def test_pull_request_sub_streams_not_selected(self): - selected_streams = ["reviews", "pr_commits"] - try: - tap_github.validate_dependencies(selected_streams) - except tap_github.DependencyException as e: - self.assertEqual(str(e), "Unable to extract 'reviews' data, to receive 'reviews' data, you also need to select 'pull_requests'. Unable to extract 'pr_commits' data, to receive 'pr_commits' data, you also need to select 'pull_requests'.") - - def test_teams_sub_streams_selected(self): - selected_streams = ["teams", "team_members"] - self.assertIsNone(tap_github.validate_dependencies(selected_streams)) - - def test_teams_sub_streams_not_selected(self): - selected_streams = ["team_members"] - try: - tap_github.validate_dependencies(selected_streams) - except tap_github.DependencyException as e: - self.assertEqual(str(e), "Unable to extract 'team_members' data, to receive 'team_members' data, you also need to select 'teams'.") - - def test_projects_sub_streams_selected(self): - selected_streams = ["projects", "project_cards"] - self.assertIsNone(tap_github.validate_dependencies(selected_streams)) - - def test_projects_sub_streams_not_selected(self): - selected_streams = ["project_columns"] - try: - tap_github.validate_dependencies(selected_streams) - except tap_github.DependencyException as e: - self.assertEqual(str(e), "Unable to extract 'project_columns' data, to receive 'project_columns' data, you also need to select 'projects'.") - - def test_mixed_streams_positive(self): - selected_streams = ["pull_requests", "reviews", "collaborators", "team_members", "stargazers", "projects", "teams", "project_cards"] - self.assertIsNone(tap_github.validate_dependencies(selected_streams)) - - def test_mixed_streams_negative(self): - selected_streams = ["project_columns", "issues", "teams", "team_memberships", "projects", "releases", "review_comments"] - try: - 
tap_github.validate_dependencies(selected_streams) - except tap_github.DependencyException as e: - self.assertEqual(str(e), "Unable to extract 'review_comments' data, to receive 'review_comments' data, you also need to select 'pull_requests'.") diff --git a/tests/unittests/test_sync.py b/tests/unittests/test_sync.py new file mode 100644 index 00000000..ef22b7f7 --- /dev/null +++ b/tests/unittests/test_sync.py @@ -0,0 +1,168 @@ +import unittest +from unittest import mock +from tap_github.sync import sync, write_schemas + + + +def get_stream_catalog(stream_name, is_selected = False): + """Return catalog for stream""" + return { + "schema":{}, + "tap_stream_id": stream_name, + "metadata": [ + { + "breadcrumb": [], + "metadata":{ + "selected": is_selected + } + } + ], + "key_properties": [] + } + + +@mock.patch("singer.write_state") +@mock.patch("tap_github.sync.write_schemas") +@mock.patch("tap_github.streams.IncrementalStream.sync_endpoint") +class TestSyncFunctions(unittest.TestCase): + """ + Test `sync` function + """ + + @mock.patch("tap_github.streams.IncrementalOrderedStream.sync_endpoint") + def test_sync_all_parents(self, mock_inc_ordered, mock_incremental, mock_write_schemas, mock_write_state): + """ + Test sync function with only all parents selected + """ + + mock_catalog = {"streams": [ + get_stream_catalog("projects", True), + get_stream_catalog("pull_requests", True) + ]} + + client = mock.Mock() + client.extract_repos_from_config.return_value = (["test-repo"], set()) + client.authed_get_all_pages.return_value = [] + client.not_accessible_repos = {} + + sync(client, {'start_date': ""}, {}, mock_catalog) + + # Verify write schema is called for selected streams + self.assertEqual(mock_write_schemas.call_count, 2) + + self.assertEqual(mock_write_schemas.mock_calls[0], mock.call("projects", mock.ANY, mock.ANY)) + self.assertEqual(mock_write_schemas.mock_calls[1], mock.call("pull_requests", mock.ANY, mock.ANY)) + + 
@mock.patch("tap_github.streams.IncrementalOrderedStream.sync_endpoint") + def test_sync_only_child(self, mock_inc_ordered, mock_incremental, mock_write_schemas, mock_write_state): + """ + Test sync function with only all children selected + """ + + mock_catalog = {"streams": [ + get_stream_catalog("projects"), + get_stream_catalog("project_columns"), + get_stream_catalog("project_cards", True), + get_stream_catalog("pull_requests"), + get_stream_catalog("review_comments", True) + ]} + + client = mock.Mock() + client.extract_repos_from_config.return_value = (["test-repo"], {"org"}) + client.authed_get_all_pages.return_value = [] + client.not_accessible_repos = {} + + sync(client, {'start_date': "2019-01-01T00:00:00Z"}, {}, mock_catalog) + + # Verify write schema is called for selected streams + self.assertEqual(mock_write_schemas.call_count, 2) + + self.assertEqual(mock_write_schemas.mock_calls[0], mock.call("projects", mock.ANY, mock.ANY)) + self.assertEqual(mock_write_schemas.mock_calls[1], mock.call("pull_requests", mock.ANY, mock.ANY)) + + @mock.patch("tap_github.streams.FullTableStream.sync_endpoint") + def test_sync_only_mid_child(self, mock_full_table, mock_incremental, mock_write_schemas, mock_write_state): + """ + Test sync function with only all mid child selected + """ + + mock_catalog = {"streams": [ + get_stream_catalog("projects"), + get_stream_catalog("project_columns", True), + get_stream_catalog("project_cards"), + get_stream_catalog("teams"), + get_stream_catalog("team_members", True), + get_stream_catalog("team_memberships") + ]} + + client = mock.Mock() + client.extract_repos_from_config.return_value = (["test-repo"], {"org"}) + client.authed_get_all_pages.return_value = [] + client.not_accessible_repos = {} + + sync(client, {'start_date': ""}, {}, mock_catalog) + + # Verify write schema is called for selected streams + self.assertEqual(mock_write_schemas.call_count, 2) + + self.assertEqual(mock_write_schemas.mock_calls[0], mock.call("teams", 
mock.ANY, mock.ANY)) + self.assertEqual(mock_write_schemas.mock_calls[1], mock.call("projects", mock.ANY, mock.ANY)) + + @mock.patch("tap_github.sync.get_stream_to_sync", return_value = []) + @mock.patch("tap_github.sync.get_selected_streams", return_value = []) + @mock.patch("tap_github.sync.update_currently_syncing_repo") + def test_no_streams_selected(self, mock_update_curr_sync, mock_selected_streams, mock_sync_streams, + mock_incremental, mock_write_schemas, mock_write_state): + """ + Test if no streams are selected then the state does not update, + and `update_currently_syncing_repo` function is not called. + """ + + state = { + "currently_syncing_repo": "singer-io/test-repo", + "bookmarks": {}, + "currently_syncing": "teams" + } + mock_catalog = {"streams": [ + get_stream_catalog("projects"), + get_stream_catalog("project_columns", True), + get_stream_catalog("teams"), + get_stream_catalog("team_members", True) + ]} + + expected_state = { + "currently_syncing_repo": "singer-io/test-repo", + "bookmarks": {}, + "currently_syncing": "teams" + } + client = mock.Mock() + client.extract_repos_from_config.return_value = ["test-repo"], ["org1"] + sync(client, {'start_date': ""}, state, mock_catalog) + + # Verify state is not changed + self.assertEqual(state, expected_state) + + # Verify updated_currently_syncing_repo was not called + self.assertFalse(mock_update_curr_sync.called) + + +@mock.patch("singer.write_schema") +class TestWriteSchemas(unittest.TestCase): + + mock_catalog = {"streams": [ + get_stream_catalog("projects"), + get_stream_catalog("project_columns"), + get_stream_catalog("project_cards") + ]} + + def test_parents_selected(self, mock_write_schema): + write_schemas("projects", self.mock_catalog, ["projects"]) + mock_write_schema.assert_called_with("projects", mock.ANY, mock.ANY) + + def test_mid_child_selected(self, mock_write_schema): + write_schemas("project_columns", self.mock_catalog, ["project_columns"]) + 
mock_write_schema.assert_called_with("project_columns", mock.ANY, mock.ANY) + + def test_nested_child_selected(self, mock_write_schema): + write_schemas("project_cards", self.mock_catalog, ["project_cards"]) + mock_write_schema.assert_called_with("project_cards", mock.ANY, mock.ANY) diff --git a/tests/unittests/test_sync_endpoint.py b/tests/unittests/test_sync_endpoint.py new file mode 100644 index 00000000..338d9ea4 --- /dev/null +++ b/tests/unittests/test_sync_endpoint.py @@ -0,0 +1,289 @@ +import unittest +from unittest import mock +from tap_github.client import GithubClient +from tap_github.streams import Commits, Events, Projects, PullRequests, StarGazers, Teams + +class MockResponse(): + """Mock response object class.""" + def __init__(self, json_data): + self.json_data = json_data + + def json(self): + return self.json_data + +@mock.patch("tap_github.streams.get_schema") +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) +@mock.patch("tap_github.client.GithubClient.authed_get_all_pages") +class TestSyncEndpoints(unittest.TestCase): + + config = {"access_token": "", "repository": "singer-io/tap-github"} + catalog = {'schema': {}, "metadata": {}} + + @mock.patch("singer.write_record") + def test_sync_without_state(self, mock_write_records, mock_authed_all_pages, mock_verify_access, mock_get_schema): + """Verify that `write_records` is called for syncing stream endpoint.""" + + test_stream = Events() + mock_get_schema.return_value = self.catalog + mock_authed_all_pages.return_value = [MockResponse([{"id": 1, "created_at": "2019-01-01T00:00:00Z"}, + {"id": 2, "created_at": "2019-01-04T00:00:00Z"}]), + MockResponse([{"id": 3, "created_at": "2019-01-03T00:00:00Z"}, + {"id": 4, "created_at": "2019-01-02T00:00:00Z"}])] + expected_state = {'bookmarks': {'tap-github': {'events': {'since': '2019-01-04T00:00:00Z'}}}} + test_client = GithubClient(self.config) + final_state = test_stream.sync_endpoint(test_client, {}, self.catalog, 
"tap-github", "2018-01-02T00:00:00Z", ["events"], ['events']) + + # Verify returned state from `sync_endpoint` + self.assertEqual(final_state, expected_state) + + # Verify `authed_get_all_pages` called with expected url + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events') + + # Verify `write_records` call count + self.assertEqual(mock_write_records.call_count, 4) + + @mock.patch("singer.write_record") + def test_sync_with_state(self, mock_write_records, mock_authed_all_pages, mock_verify_access, mock_get_schema): + """Verify that `write_records` is called for records with replication value greater than bookmark.""" + + test_stream = Events() + mock_get_schema.return_value = self.catalog + mock_authed_all_pages.return_value = [MockResponse([{"id": 1, "created_at": "2019-01-01T00:00:00Z"}, + {"id": 2, "created_at": "2019-01-04T00:00:00Z"}]), + MockResponse([{"id": 3, "created_at": "2019-01-03T00:00:00Z"}, + {"id": 4, "created_at": "2019-01-02T00:00:00Z"}])] + mock_state = {'bookmarks': {'tap-github': {'events': {'since': '2019-01-02T00:00:00Z'}}}} + + expected_state = {'bookmarks': {'tap-github': {'events': {'since': '2019-01-04T00:00:00Z'}}}} + test_client = GithubClient(self.config) + final_state = test_stream.sync_endpoint(test_client, mock_state, self.catalog, "tap-github", "2018-01-02T00:00:00Z", ["events"], ['events']) + + # Verify returned state from `sync_endpoint` + self.assertEqual(final_state, expected_state) + + # Verify `write_records` call count + self.assertEqual(mock_write_records.call_count, 3) + + # Verify `authed_get_all_pages` called with expected url + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events') + mock_write_records.assert_called_with(mock.ANY, {'id': 4, 'created_at': '2019-01-02T00:00:00Z', '_sdc_repository': 'tap-github'},time_extracted = mock.ANY) + + 
+@mock.patch("tap_github.streams.get_schema") +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) +@mock.patch("tap_github.client.GithubClient.authed_get_all_pages") +class TestFullTable(unittest.TestCase): + """ + Test `sync_endpoint` for full table streams. + """ + config = {"access_token": "", "repository": "singer-io/tap-github"} + catalog = {"schema": {}, "metadata": {}} + + @mock.patch("tap_github.streams.Stream.get_child_records") + def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is not called for streams which do not have child streams""" + + test_client = GithubClient(self.config) + test_stream = StarGazers() + mock_get_schema.return_value = self.catalog + mock_authed_get_all_pages.return_value = [MockResponse([{"user": {"id": 1}}, {"user": {"id": 2}}]), + MockResponse([{"user": {"id": 4}}, {"user": {"id": 3}}])] + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["stargazers"], ["stargazers"]) + + # Verify that the authed_get_all_pages() is called with the expected url + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, stream='stargazers') + + # Verify that the get_child_records() is not called as Stargazers doesn't have a child stream + self.assertFalse(mock_get_child_records.called) + + @mock.patch("tap_github.streams.Stream.get_child_records") + def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is called for streams with child streams""" + + test_client = GithubClient(self.config) + test_stream = Teams() + mock_get_schema.return_value = self.catalog + + mock_authed_get_all_pages.return_value = [MockResponse([{"id": 1, "slug": "s1"}, {"id": 2, "slug": "s2"}]), + MockResponse([{"id": 3, "slug": "s3"}, 
{"id": 4, "slug": "s4"}])] + + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["teams", "team_members"], ["teams","team_members"]) + + # Verify that the authed_get_all_pages() is called with the expected url + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams') + + # Verify that the get_child_records() is called + self.assertTrue(mock_get_child_records.called) + + def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is called for streams with child streams and calls authed_get_all_pages() is called as expected""" + + test_client = GithubClient(self.config) + test_stream = Teams() + mock_get_schema.return_value = self.catalog + + mock_authed_get_all_pages.side_effect = [ + [MockResponse([{"id": 1, "slug": "stitch-dev"}])], + [MockResponse([{"login": "log1"}, {"login": "log2"}])], + [MockResponse({"url": "u1"})], + [MockResponse({"url": "u3"})], + [], [] + ] + + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["teams", "team_members", "team_memberships"], ["teams","team_members", "team_memberships"]) + + # Verify that the authed_get_all_pages() is called expected number of times + self.assertEqual(mock_authed_get_all_pages.call_count, 4) + + # Verify that the authed_get_all_pages() is called with the expected url + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams') + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", stream='team_members') + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", stream='team_memberships') + + self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) + self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) + 
self.assertEqual(mock_authed_get_all_pages.mock_calls[2], exp_call_3) + +@mock.patch("tap_github.streams.get_schema") +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) +@mock.patch("tap_github.client.GithubClient.authed_get_all_pages") +class TestIncrementalStream(unittest.TestCase): + """ + Test `sync_endpoint` for incremental streams. + """ + + config = {"access_token": "", "repository": "singer-io/tap-github"} + catalog = {"schema": {}, "metadata": {}} + + @mock.patch("tap_github.streams.Stream.get_child_records") + def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is not called for streams which do not have child streams""" + test_client = GithubClient(self.config) + test_stream = Commits() + mock_get_schema.return_value = self.catalog + mock_authed_get_all_pages.return_value = [MockResponse([{"commit": {"committer": {"date": "2022-07-05T09:42:14.000000Z"}}}, {"commit": {"committer": {"date": "2022-07-06T09:42:14.000000Z"}}}]), + MockResponse([{"commit": {"committer": {"date": "2022-07-07T09:42:14.000000Z"}}}, {"commit": {"committer": {"date": "2022-07-08T09:42:14.000000Z"}}}])] + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["commits"], ["commits"]) + + # Verify that the authed_get_all_pages() is called with the expected url + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, stream='commits') + + # Verify that the get_child_records() is not called as Commits does not contain any child stream. 
+ self.assertFalse(mock_get_child_records.called) + + @mock.patch("tap_github.streams.Stream.get_child_records") + def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is called for streams with child streams""" + test_client = GithubClient(self.config) + test_stream = Projects() + mock_get_schema.return_value = self.catalog + + mock_authed_get_all_pages.return_value = [MockResponse([{"id": 1, "updated_at": "2022-07-05T09:42:14.000000Z"}, {"id": 1, "updated_at": "2022-07-06T09:42:14.000000Z"}]), + MockResponse([{"id": 1, "updated_at": "2022-07-07T09:42:14.000000Z"}, {"id": 1, "updated_at": "2022-07-08T09:42:14.000000Z"}])] + + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns"], ["projects","project_columns"]) + + # Verify that the authed_get_all_pages() is called with the expected url + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') + + # Verify that the get_child_records() is called as thw Projects stream has a child stream + self.assertTrue(mock_get_child_records.called) + + def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is called for streams with child streams and calls authed_get_all_pages() is called as expected""" + test_client = GithubClient(self.config) + test_stream = Projects() + mock_get_schema.return_value = self.catalog + + mock_authed_get_all_pages.side_effect = [ + [MockResponse([{"id": 1, "updated_at": "2022-07-05T09:42:14.000000Z"}])], + [MockResponse([{"id": 1}, {"id": 2}])], + [MockResponse({"id": 1})], + [MockResponse({"id": 2})], + [], [] + ] + + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns", "project_cards"], 
["projects","project_columns", "project_cards"]) + + # Verify that the authed_get_all_pages() is called expected number of times + self.assertEqual(mock_authed_get_all_pages.call_count, 4) + + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns') + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards') + + # Verify that the API calls are done as expected with the correct url + self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) + self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) + self.assertEqual(mock_authed_get_all_pages.mock_calls[2], exp_call_3) + +@mock.patch("tap_github.streams.get_schema") +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) +@mock.patch("tap_github.client.GithubClient.authed_get_all_pages") +@mock.patch("tap_github.streams.singer.utils.strptime_to_utc") +class TestIncrementalOrderedStream(unittest.TestCase): + """ + Test `sync_endpoint` for incremental ordered streams. 
+ """ + config = {"access_token": "", "repository": "singer-io/tap-github"} + catalog = {"schema": {}, "metadata": {}} + + @mock.patch("tap_github.streams.Stream.get_child_records") + def test_without_child_stream(self, mock_get_child_records, mock_strptime_to_utc, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is not called when child stream is not selected""" + test_client = GithubClient(self.config) + test_stream = PullRequests() + mock_strptime_to_utc.side_effect = ["2022-07-05 09:42:14", "2022-07-04 09:42:14"] + mock_get_schema.return_value = self.catalog + mock_authed_get_all_pages.return_value = [MockResponse([{"id": 1, "updated_at": "2022-07-05 09:42:14"}, {"id": 2, "updated_at": "2022-07-06 09:42:14"}]), + MockResponse([{"id": 3, "updated_at": "2022-07-07 09:42:14"}, {"id": 4, "updated_at": "2022-07-08 09:42:14"}])] + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests"], ["pull_requests"]) + + # Verify that the authed_get_all_pages() is called with the expected url + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + + + @mock.patch("tap_github.streams.Stream.get_child_records") + def test_with_child_streams(self, mock_get_child_records, mock_strptime_to_utc, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is called for streams with child streams""" + test_client = GithubClient(self.config) + test_stream = PullRequests() + mock_strptime_to_utc.side_effect = ["2022-07-05T09:42:14.000000Z", "2022-07-06T09:42:14.000000Z", "2022-07-05T09:42:14.000000Z", "2022-07-05T09:42:14.000000Z", "2022-07-05T09:42:14.000000Z"] + mock_get_schema.return_value = self.catalog + + mock_authed_get_all_pages.return_value = [MockResponse([{"id": 1, "number": 1, "updated_at": "2022-07-05T09:42:14.000000Z"}, {"id": 1, 
"number": 1, "updated_at": "2022-07-06T09:42:14.000000Z"}]), + MockResponse([{"id": 1, "number": 1, "updated_at": "2022-07-07T09:42:14.000000Z"}, {"id": 1, "number": 1, "updated_at": "2022-07-08T09:42:14.000000Z"}])] + + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests", "review_comments"], ["pull_requests","review_comments"]) + + # Verify that the authed_get_all_pages() is called with the expected url + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + + # Verify that the get_child_records() is called as the PullRequests stream has a child stream + self.assertTrue(mock_get_child_records.called) + + def test_with_nested_child_streams(self, mock_strptime_to_utc, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + """Verify that get_child_records() is called for streams with child streams and calls authed_get_all_pages() is called as expected""" + test_client = GithubClient(self.config) + test_stream = PullRequests() + mock_get_schema.return_value = self.catalog + mock_strptime_to_utc.side_effect = ["2022-07-05T09:42:14.000000Z", "2022-07-06T09:42:14.000000Z", "2022-07-06T09:42:14.000000Z"] + + mock_authed_get_all_pages.side_effect = [ + [MockResponse([{"id": 1, "number": 1, "updated_at": "2022-07-05T09:42:14.000000Z"}])], + [MockResponse([{"id": 1, "updated_at": "2022-07-06T09:42:14.000000Z"}, {"id": 2, "updated_at": "2022-07-06T09:42:14.000000Z"}])], + [], [] + ] + + test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests", "review_comments"], ["pull_requests","review_comments"]) + + # Verify that the authed_get_all_pages() is called expected number of times + self.assertEqual(mock_authed_get_all_pages.call_count, 2) + + print(mock_authed_get_all_pages.mock_calls) + exp_call_1 = mock.call(mock.ANY, 
"https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", stream='review_comments') + + # Verify that the API calls are done as expected with the correct url + self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) + self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) diff --git a/tests/unittests/test_timeout.py b/tests/unittests/test_timeout.py index ce9a4769..a3f6ca53 100644 --- a/tests/unittests/test_timeout.py +++ b/tests/unittests/test_timeout.py @@ -1,9 +1,13 @@ import unittest from unittest import mock -import tap_github.__init__ as tap_github +import tap_github +from tap_github.client import GithubClient, REQUEST_TIMEOUT import requests +from parameterized import parameterized class Mockresponse: + """ Mock response object class.""" + def __init__(self, status_code, json, raise_error, headers={'X-RateLimit-Remaining': 1}, text=None, content=None): self.status_code = status_code self.raise_error = raise_error @@ -18,19 +22,24 @@ def raise_for_status(self): raise requests.HTTPError("Sample message") def json(self): + """ Response JSON method.""" return self.text class MockParseArgs: + """Mock args object class""" config = {} def __init__(self, config): self.config = config def get_args(config): + """ Returns required args response. """ return MockParseArgs(config) def get_response(status_code, json={}, raise_error=False, content=None): + """ Returns required mock response. 
""" return Mockresponse(status_code, json, raise_error, content=content) +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) @mock.patch("time.sleep") @mock.patch("requests.Session.request") @mock.patch("singer.utils.parse_args") @@ -38,121 +47,41 @@ class TestTimeoutValue(unittest.TestCase): """ Test case to verify the timeout value is set as expected """ - - def test_timeout_value_in_config(self, mocked_parse_args, mocked_request, mocked_sleep): - json = {"key": "value"} - # mock response - mocked_request.return_value = get_response(200, json) - - mock_config = {"request_timeout": 100} - # mock parse args - mocked_parse_args.return_value = get_args(mock_config) - - # get the timeout value for assertion - timeout = tap_github.get_request_timeout() - # function call - tap_github.authed_get("test_source", "") - - # verify that we got expected timeout value - self.assertEquals(100.0, timeout) - # verify that the request was called with expected timeout value - mocked_request.assert_called_with(method='get', url='', timeout=100.0) - - def test_timeout_value_not_in_config(self, mocked_parse_args, mocked_request, mocked_sleep): - json = {"key": "value"} - # mock response - mocked_request.return_value = get_response(200, json) - - mock_config = {} - # mock parse args - mocked_parse_args.return_value = get_args(mock_config) - - # get the timeout value for assertion - timeout = tap_github.get_request_timeout() - # function call - tap_github.authed_get("test_source", "") - - # verify that we got expected timeout value - self.assertEquals(300.0, timeout) - # verify that the request was called with expected timeout value - mocked_request.assert_called_with(method='get', url='', timeout=300.0) - - def test_timeout_string_value_in_config(self, mocked_parse_args, mocked_request, mocked_sleep): - json = {"key": "value"} - # mock response - mocked_request.return_value = get_response(200, json) - - mock_config = {"request_timeout": "100"} - # 
mock parse args - mocked_parse_args.return_value = get_args(mock_config) - - # get the timeout value for assertion - timeout = tap_github.get_request_timeout() - # function call - tap_github.authed_get("test_source", "") - - # verify that we got expected timeout value - self.assertEquals(100.0, timeout) - # verify that the request was called with expected timeout value - mocked_request.assert_called_with(method='get', url='', timeout=100.0) - - def test_timeout_empty_value_in_config(self, mocked_parse_args, mocked_request, mocked_sleep): - json = {"key": "value"} - # mock response - mocked_request.return_value = get_response(200, json) - - mock_config = {"request_timeout": ""} - # mock parse args - mocked_parse_args.return_value = get_args(mock_config) - - # get the timeout value for assertion - timeout = tap_github.get_request_timeout() - # function call - tap_github.authed_get("test_source", "") - - # verify that we got expected timeout value - self.assertEquals(300.0, timeout) - # verify that the request was called with expected timeout value - mocked_request.assert_called_with(method='get', url='', timeout=300.0) - - def test_timeout_0_value_in_config(self, mocked_parse_args, mocked_request, mocked_sleep): - json = {"key": "value"} + json = {"key": "value"} + + @parameterized.expand([ + ["test_int_value", {"request_timeout": 100, "access_token": "access_token"}, 100.0], + ["test_str_value", {"request_timeout": "100", "access_token": "access_token"}, 100.0], + ["test_empty_value", {"request_timeout": "", "access_token": "access_token"}, 300.0], + ["test_int_zero_value", {"request_timeout": 0, "access_token": "access_token"}, 300.0], + ["test_str_zero_value", {"request_timeout": "0", "access_token": "access_token"}, 300.0], + ["test_no_value", {"request_timeout": "0", "access_token": "access_token"}, REQUEST_TIMEOUT] + + ]) + def test_timeout_value_in_config(self, mocked_parse_args, mocked_request, mocked_sleep, mock_verify_access, name, config, expected_value): 
+ """ + Test if timeout value given in config + """ # mock response - mocked_request.return_value = get_response(200, json) + mocked_request.return_value = get_response(200, self.json) - mock_config = {"request_timeout": 0.0} + mock_config = config # mock parse args mocked_parse_args.return_value = get_args(mock_config) + test_client = GithubClient(mock_config) # get the timeout value for assertion - timeout = tap_github.get_request_timeout() + timeout = test_client.get_request_timeout() # function call - tap_github.authed_get("test_source", "") + test_client.authed_get("test_source", "") # verify that we got expected timeout value - self.assertEquals(300.0, timeout) + self.assertEqual(expected_value, timeout) # verify that the request was called with expected timeout value - mocked_request.assert_called_with(method='get', url='', timeout=300.0) + mocked_request.assert_called_with(method='get', url='', timeout=expected_value) - def test_timeout_string_0_value_in_config(self, mocked_parse_args, mocked_request, mocked_sleep): - json = {"key": "value"} - # mock response - mocked_request.return_value = get_response(200, json) - - mock_config = {"request_timeout": "0.0"} - # mock parse args - mocked_parse_args.return_value = get_args(mock_config) - - # get the timeout value for assertion - timeout = tap_github.get_request_timeout() - # function call - tap_github.authed_get("test_source", "") - - # verify that we got expected timeout value - self.assertEquals(300.0, timeout) - # verify that the request was called with expected timeout value - mocked_request.assert_called_with(method='get', url='', timeout=300.0) +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) @mock.patch("time.sleep") @mock.patch("requests.Session.request") @mock.patch("singer.utils.parse_args") @@ -161,36 +90,26 @@ class TestTimeoutAndConnnectionErrorBackoff(unittest.TestCase): Test case to verify that we backoff for 5 times for Connection and Timeout error """ 
- def test_timeout_backoff(self, mocked_parse_args, mocked_request, mocked_sleep): - # mock request and raise 'Timeout' error - mocked_request.side_effect = requests.Timeout - - mock_config = {} + @parameterized.expand([ + ["test_timeout_backoff", requests.Timeout], + ["test_connection_error_backoff", requests.ConnectionError] + ]) + def test_backoff(self, mocked_parse_args, mocked_request, mocked_sleep, mock_verify_access, name, error_class): + """ + Test that tap retry timeout or connection error 5 times. + """ + # mock request and raise error + mocked_request.side_effect = error_class + + mock_config = {"access_token": "access_token"} # mock parse args mocked_parse_args.return_value = get_args(mock_config) + test_client = GithubClient(mock_config) - try: - # function call - tap_github.authed_get("test_source", "") - except requests.Timeout: - pass + with self.assertRaises(error_class): + test_client.authed_get("test_source", "") # verify that we backoff 5 times - self.assertEquals(5, mocked_request.call_count) + self.assertEqual(5, mocked_request.call_count) - def test_connection_error_backoff(self, mocked_parse_args, mocked_request, mocked_sleep): - # mock request and raise 'Connection' error - mocked_request.side_effect = requests.ConnectionError - mock_config = {} - # mock parse args - mocked_parse_args.return_value = get_args(mock_config) - - try: - # function call - tap_github.authed_get("test_source", "") - except requests.ConnectionError: - pass - - # verify that we backoff 5 times - self.assertEquals(5, mocked_request.call_count) diff --git a/tests/unittests/test_verify_access.py b/tests/unittests/test_verify_access.py index 1e00df32..bdd93209 100644 --- a/tests/unittests/test_verify_access.py +++ b/tests/unittests/test_verify_access.py @@ -1,9 +1,12 @@ from unittest import mock import tap_github +from tap_github.client import GithubClient import unittest import requests class Mockresponse: + """ Mock response object class.""" + def __init__(self, 
status_code, json, raise_error, headers={'X-RateLimit-Remaining': 1}, text=None): self.status_code = status_code self.raise_error = raise_error @@ -18,110 +21,42 @@ def raise_for_status(self): raise requests.HTTPError("Sample message") def json(self): + """ Response JSON method.""" return self.text def get_response(status_code, json={}, raise_error=False): + """ Returns required mock response. """ return Mockresponse(status_code, json, raise_error) +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) @mock.patch("requests.Session.request") @mock.patch("singer.utils.parse_args") class TestCredentials(unittest.TestCase): + """ + Test `verify_repo_access` error handling + """ - def test_repo_not_found(self, mocked_parse_args, mocked_request): - json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} - mocked_request.return_value = get_response(404, json, True) - - try: - tap_github.verify_repo_access("", "repo") - except tap_github.NotFoundException as e: - self.assertEqual(str(e), "HTTP-error-code: 404, Error: Please check the repository name 'repo' or you do not have sufficient permissions to access this repository.") + config = {"access_token": "", "repository": "singer-io/tap-github"} - def test_repo_bad_request(self, mocked_parse_args, mocked_request): + def test_repo_bad_request(self, mocked_parse_args, mocked_request, mock_verify_access): + """Verify if 400 error arises""" + test_client = GithubClient(self.config) mocked_request.return_value = get_response(400, raise_error = True) - try: - tap_github.verify_repo_access("", "repo") - except tap_github.BadRequestException as e: - self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") - - def test_repo_bad_creds(self, mocked_parse_args, mocked_request): - json = {"message": "Bad credentials", "documentation_url": "https://docs.github.com/"} - mocked_request.return_value = get_response(401, json, True) - - 
try: - tap_github.verify_repo_access("", "repo") - except tap_github.BadCredentialsException as e: - self.assertEqual(str(e), "HTTP-error-code: 401, Error: {}".format(json)) - - @mock.patch("tap_github.get_catalog") - def test_discover_valid_creds(self, mocked_get_catalog, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(200) - mocked_get_catalog.return_value = {} - - tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) - - self.assertTrue(mocked_get_catalog.call_count, 1) - - @mock.patch("tap_github.get_catalog") - def test_discover_not_found(self, mocked_get_catalog, mocked_parse_args, mocked_request): - json = {"message": "Not Found", "documentation_url": "https:/docs.github.com/"} - mocked_request.return_value = get_response(404, json, True) - mocked_get_catalog.return_value = {} + with self.assertRaises(tap_github.client.BadRequestException) as e: + test_client.verify_repo_access("", "repo") - try: - tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) - except tap_github.NotFoundException as e: - self.assertEqual(str(e), "HTTP-error-code: 404, Error: Please check the repository name org/repo or you do not have sufficient permissions to access this repository.") - self.assertEqual(mocked_get_catalog.call_count, 1) + # Verify error with proper message + self.assertEqual(str(e.exception), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") - @mock.patch("tap_github.get_catalog") - def test_discover_bad_request(self, mocked_get_catalog, mocked_parse_args, mocked_request): - mocked_request.return_value = get_response(400, raise_error = True) - mocked_get_catalog.return_value = {} - - try: - tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) - except tap_github.BadRequestException as e: - self.assertEqual(str(e), "HTTP-error-code: 400, Error: The request is missing or has a bad parameter.") - 
self.assertEqual(mocked_get_catalog.call_count, 0) - - @mock.patch("tap_github.get_catalog") - def test_discover_bad_creds(self, mocked_get_catalog, mocked_parse_args, mocked_request): - json = {"message":"Bad credentials","documentation_url":"https://docs.github.com/"} + def test_repo_bad_creds(self, mocked_parse_args, mocked_request, mock_verify_access): + """Verify if 401 error arises""" + test_client = GithubClient(self.config) + json = {"message": "Bad credentials", "documentation_url": "https://docs.github.com/"} mocked_request.return_value = get_response(401, json, True) - mocked_get_catalog.return_value = {} - - try: - tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) - except tap_github.BadCredentialsException as e: - self.assertEqual(str(e), "HTTP-error-code: 401, Error: {}".format(json)) - self.assertEqual(mocked_get_catalog.call_count, 0) - - @mock.patch("tap_github.get_catalog") - def test_discover_forbidden(self, mocked_get_catalog, mocked_parse_args, mocked_request): - json = {'message': 'Must have admin rights to Repository.', 'documentation_url': 'https://docs.github.com/'} - mocked_request.return_value = get_response(403, json, True) - mocked_get_catalog.return_value = {} - - try: - tap_github.do_discover({"access_token": "access_token", "repository": "org/repo"}) - except tap_github.AuthException as e: - self.assertEqual(str(e), "HTTP-error-code: 403, Error: {}".format(json)) - self.assertEqual(mocked_get_catalog.call_count, 0) - - -@mock.patch("tap_github.logger.info") -@mock.patch("tap_github.verify_repo_access") -class TestRepoCallCount(unittest.TestCase): - def test_repo_call_count(self, mocked_repo, mocked_logger_info): - """ - Here 3 repos are given, - so tap will check creds for all 3 repos - """ - mocked_repo.return_value = None - config = {"access_token": "access_token", "repository": "org1/repo1 org1/repo2 org2/repo1"} - tap_github.verify_access_for_repo(config) + with 
self.assertRaises(tap_github.client.BadCredentialsException) as e: + test_client.verify_repo_access("", "repo") - self.assertEqual(mocked_logger_info.call_count, 3) - self.assertEqual(mocked_repo.call_count, 3) + # Verify error with proper message + self.assertEqual(str(e.exception), "HTTP-error-code: 401, Error: {}".format(json)) From ad178cd0932732f7798d519495bca6f020667cd9 Mon Sep 17 00:00:00 2001 From: Sourabh Gandhi <105213416+sgandhi1311@users.noreply.github.com> Date: Thu, 29 Sep 2022 15:10:54 +0530 Subject: [PATCH 06/30] major bump version 2.0.0 (#181) * major bump version 2.0.0 * update change logs * Modified changelog.md * Modified Changelog.md for schema updates Co-authored-by: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> --- CHANGELOG.md | 11 +++++++++++ setup.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2e645ca..d84d7ad8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +# 2.0.0 + * Schema updates [#170](https://github.com/singer-io/tap-github/pull/170) [#169](https://github.com/singer-io/tap-github/pull/169) + * Update data types of fields in `events` and `issue_events` stream + * Add missing fields to the schemas + * Update dict based implementation to class based [#168](https://github.com/singer-io/tap-github/pull/168) + * Implement currently syncing for repos and streams [#171](https://github.com/singer-io/tap-github/pull/171) [#174](https://github.com/singer-io/tap-github/pull/174) + * Implement custom exception handling and backoff for 5xx error [#166](https://github.com/singer-io/tap-github/pull/166) + * Support of custom domain [#172](https://github.com/singer-io/tap-github/pull/172) + * Sync teams at organization level [#173](https://github.com/singer-io/tap-github/pull/173) + * Update integration test suite [#167](https://github.com/singer-io/tap-github/pull/167) + # 1.10.4 * Fix team_members stream primary Key [#157] 
(https://github.com/singer-io/tap-github/pull/157) diff --git a/setup.py b/setup.py index 4b191385..b6c06fef 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='1.10.4', + version='2.0.0', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 099df74b26da865e73fded5516f5926c4ee1851c Mon Sep 17 00:00:00 2001 From: Andy Lu Date: Tue, 9 May 2023 08:39:23 -0500 Subject: [PATCH 07/30] TDL-22816 allow discovery of commitless repos (#187) * Allow discovery of commitless repos * Allow `commits` stream sync to continue when we hit an empty repo * Revert "Allow discovery of commitless repos" This reverts commit efcd42fb4c7b8884671a80fef754d01ecb0fca8d. * Add a stream to the verify call so `raise_for_error` works * Make pylint happy * Fix bookmarks test by pinning window The tap syncs from the start_date/bookmark until now() and we don't have new test data on the test repo. This causes tests to fail after a while. The ability to shift the bookmark, on a per-stream basis, was already put in place so we use it to make the state work with the data we have. 
`Commits` only has two days of data, so we need to put the start date before the first day and the bookmark on the second day in order for the second sync to pull less data --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Fix pagination test by excluding streams without enough data --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Make the test log which version of the test failed --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Fix start_date test by specifying the date for `issues` and `pull_requests` --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Force `calculated_states_by_stream` to take start_date This was initially an optional param, but this is the only test to use the function, so we can update its signature everywhere --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * TDL-22816 style changes (#188) [skip ci] * Whitespace clean up --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Params get their own lines --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Comments go above code --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Fix long lines --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Update logger to info [skip-ci] --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Trigger CI --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> * Bump version, update changelog --------- Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> --------- Co-authored-by: Dylan Sprayberry 
<28106103+dsprayberry@users.noreply.github.com> --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_github/client.py | 12 +++++++++-- tests/test_github_bookmarks.py | 34 ++++++++++++++++++++------------ tests/test_github_pagination.py | 35 +++++++++++++++++++++++---------- tests/test_github_start_date.py | 30 ++++++++++++++++++++-------- 6 files changed, 82 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d84d7ad8..1f9cd9ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 2.0.1 + * Allow `commits` stream sync to continue when we hit an empty repo [#187](https://github.com/singer-io/tap-github/pull/187) + # 2.0.0 * Schema updates [#170](https://github.com/singer-io/tap-github/pull/170) [#169](https://github.com/singer-io/tap-github/pull/169) * Update data types of fields in `events` and `issue_events` stream diff --git a/setup.py b/setup.py index b6c06fef..de812020 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.0', + version='2.0.1', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index 9913a8c2..c7f2a217 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -105,6 +105,10 @@ def raise_for_error(resp, source, stream, client, should_skip_404): except JSONDecodeError: response_json = {} + if stream == "commits" and response_json.get("message") == "Git Repository is empty.": + LOGGER.info("Encountered an empty git repository") + return None + if error_code == 404 and should_skip_404: # Add not accessible stream into list. 
client.not_accessible_repos.add(stream) @@ -199,8 +203,12 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code rate_throttling(resp, self.max_sleep_seconds) - if resp.status_code == 404: + if resp.status_code in {404, 409}: # Return an empty response body since we're not raising a NotFoundException + + # In the 409 case, this is only for `commits` returning an + # error for an empty repository, so we'll treat this as an + # empty list of records to process resp._content = b'{}' # pylint: disable=protected-access return resp @@ -224,7 +232,7 @@ def verify_repo_access(self, url_for_repo, repo): Call rest API to verify that the user has sufficient permissions to access this repository. """ try: - self.authed_get("verifying repository access", url_for_repo) + self.authed_get("verifying repository access", url_for_repo, stream="commits") except NotFoundException: # Throwing user-friendly error message as it checks token access message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) diff --git a/tests/test_github_bookmarks.py b/tests/test_github_bookmarks.py index 9e2c4135..d40372d2 100644 --- a/tests/test_github_bookmarks.py +++ b/tests/test_github_bookmarks.py @@ -14,15 +14,18 @@ class TestGithubBookmarks(TestGithubBase): def name(): return "tap_tester_github_bookmarks" - def calculated_states_by_stream(self, current_state, synced_records, replication_keys): + def calculated_states_by_stream(self, current_state, synced_records, replication_keys, start_date): """ - Look at the bookmarks from a previous sync and set a new bookmark - value based off timedelta expectations. This ensures the subsequent sync will replicate - at least 1 record but, fewer records than the previous sync. 
+ Look at the bookmarks from a previous sync and shift it to a + date to ensure the subsequent sync will replicate at least 1 + record but, fewer records than the previous sync. """ - timedelta_by_stream = {stream: [90,0,0] # {stream_name: [days, hours, minutes], ...} + # {stream_name: [days, hours, minutes], ...} + timedelta_by_stream = {stream: [90,0,0] for stream in self.expected_streams()} + timedelta_by_stream["commits"] = [7, 0, 0] + repo = self.get_properties().get('repository') stream_to_calculated_state = {repo: {stream: "" for stream in current_state['bookmarks'][repo].keys()}} @@ -31,7 +34,9 @@ def calculated_states_by_stream(self, current_state, synced_records, replication state_as_datetime = dateutil.parser.parse(state_value) days, hours, minutes = timedelta_by_stream[stream] - calculated_state_as_datetime = state_as_datetime - datetime.timedelta(days=days, hours=hours, minutes=minutes) + + start_date_as_datetime = dateutil.parser.parse(start_date) + calculated_state_as_datetime = start_date_as_datetime + datetime.timedelta(days=days, hours=hours, minutes=minutes) state_format = '%Y-%m-%dT%H:%M:%SZ' calculated_state_formatted = datetime.datetime.strftime(calculated_state_as_datetime, state_format) @@ -49,7 +54,7 @@ def test_run(self): All data of the second sync is >= the bookmark from the first sync The number of records in the 2nd sync is less then the first • Verify that for full table stream, all data replicated in sync 1 is replicated again in sync 2. 
- + PREREQUISITE For EACH stream that is incrementally replicated there are multiple rows of data with different values for the replication key @@ -83,9 +88,12 @@ def test_run(self): ### Update State Between Syncs ########################################################################## + first_sync_start_date = self.get_properties()['start_date'] new_states = {'bookmarks': dict()} simulated_states = self.calculated_states_by_stream(first_sync_bookmarks, - first_sync_records, expected_replication_keys) + first_sync_records, + expected_replication_keys, + first_sync_start_date) for repo, new_state in simulated_states.items(): new_states['bookmarks'][repo] = new_state menagerie.set_state(conn_id, new_states) @@ -126,7 +134,7 @@ def test_run(self): replication_key = next(iter(expected_replication_keys[stream])) first_bookmark_value = first_bookmark_key_value.get('since') second_bookmark_value = second_bookmark_key_value.get('since') - + first_bookmark_value_ts = self.dt_to_ts(first_bookmark_value, self.BOOKMARK_FORMAT) second_bookmark_value_ts = self.dt_to_ts(second_bookmark_value, self.BOOKMARK_FORMAT) @@ -147,11 +155,11 @@ def test_run(self): # For events stream replication key value is coming in different format if stream == 'events': replication_key_format = self.EVENTS_RECORD_REPLICATION_KEY_FORMAT - + for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = self.dt_to_ts(record.get(replication_key), replication_key_format) - + self.assertLessEqual( replication_key_value, first_bookmark_value_ts, msg="First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." 
@@ -160,10 +168,10 @@ def test_run(self): for record in second_sync_messages: # Verify the second sync bookmark value is the max replication key value for a given stream replication_key_value = self.dt_to_ts(record.get(replication_key), replication_key_format) - + self.assertGreaterEqual(replication_key_value, simulated_bookmark_value, msg="Second sync records do not respect the previous bookmark.") - + self.assertLessEqual( replication_key_value, second_bookmark_value_ts, msg="Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." diff --git a/tests/test_github_pagination.py b/tests/test_github_pagination.py index 06a24abd..f0ec3196 100644 --- a/tests/test_github_pagination.py +++ b/tests/test_github_pagination.py @@ -23,30 +23,45 @@ def get_properties(self, original: bool = True): return return_value def test_run(self): - + streams_to_test = self.expected_streams() # Pagination is not supported for "team_memberships" by Github API. # Skipping "teams" stream as it's RECORD count is <= 30. - untestable_streams = {'team_memberships', 'teams'} + untestable_streams = { + 'team_memberships', + 'teams', + 'team_members', + 'collaborators', + 'assignees', + } - # For some streams RECORD count were not > 30 in same test-repo. + # For some streams RECORD count were not > 30 in same test-repo. # So, separated streams on the basis of RECORD count. 
self.repository_name = 'singer-io/tap-github' - expected_stream_1 = {'comments', 'stargazers', 'commits', 'pull_requests', 'reviews', 'review_comments', 'pr_commits', 'issues'} + expected_stream_1 = { + 'comments', + 'stargazers', + 'commits', + 'pull_requests', + 'reviews', + 'review_comments', + 'pr_commits', + 'issues', + } self.run_test(expected_stream_1) - + self.repository_name = 'singer-io/test-repo' expected_stream_2 = streams_to_test - expected_stream_1 - untestable_streams self.run_test(expected_stream_2) - + def run_test(self, streams): """ - • Verify that for each stream you can get multiple pages of data. + • Verify that for each stream you can get multiple pages of data. This requires we ensure more than 1 page of data exists at all times for any given stream. • Verify by pks that the data replicated matches the data we expect. """ - + # Page size for pagination supported streams page_size = 30 conn_id = connections.ensure_connection(self) @@ -83,7 +98,7 @@ def run_test(self, streams): # Verify that for each stream you can get multiple pages of data self.assertGreater(record_count_sync, page_size, msg="The number of records is not over the stream max limit") - + # Chunk the replicated records (just primary keys) into expected pages pages = [] page_count = ceil(len(primary_keys_list) / page_size) @@ -102,4 +117,4 @@ def run_test(self, streams): self.assertTrue( current_page.isdisjoint(other_page), msg=f'other_page_primary_keys={other_page}' - ) \ No newline at end of file + ) diff --git a/tests/test_github_start_date.py b/tests/test_github_start_date.py index 5ea10ced..a37fa43c 100644 --- a/tests/test_github_start_date.py +++ b/tests/test_github_start_date.py @@ -42,13 +42,27 @@ def test_run(self): self.run_test(date_1, date_2, expected_stream_2) date_2 = '2022-05-06T00:00:00Z' - expected_stream_3 = {'pull_requests', 'pr_commits', 'review_comments', 'reviews'} + expected_stream_3 = {'pr_commits', 'review_comments', 'reviews'} self.run_test(date_1, 
date_2, expected_stream_3) date_2 = '2022-01-27T00:00:00Z' + expected_stream_4 = self.expected_streams().difference( + expected_stream_1, + expected_stream_2, + expected_stream_3, + {'events', 'issues', 'pull_requests'} + ) + # run the test for all the streams excluding 'events' stream - # as for 'events' stream we have to use dynamic dates - self.run_test(date_1, date_2, self.expected_streams() - expected_stream_1 - expected_stream_2 - expected_stream_3 - {'events'}) + # as for 'events' stream we have to use dynamic dates. + # `issues` doesn't have enough data in this range, so we skip it too + self.run_test(date_1, date_2, expected_stream_4) + + date_3 = '2023-01-27T00:00:00Z' + self.run_test(date_1, date_3, {"issues"}) + + date_4 = '2023-01-01T00:00:00Z' + self.run_test(date_1, date_4, {'pull_requests'}) # As per the Documentation: https://docs.github.com/en/rest/reference/activity#events # the 'events' of past 90 days will only be returned @@ -60,7 +74,7 @@ def test_run(self): self.run_test(date_1, date_2, {'events'}) def run_test(self, date_1, date_2, streams): - """ + """ • Verify that a sync with a later start date has at least one record synced and less records than the 1st sync with a previous start date • Verify that each stream has less records than the earlier start date sync @@ -89,7 +103,7 @@ def run_test(self, date_1, date_2, streams): # run check mode found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1) - + # table and field selection test_catalogs_1_all_fields = [catalog for catalog in found_catalogs_1 if catalog.get('stream_name') in expected_streams] @@ -130,7 +144,7 @@ def run_test(self, date_1, date_2, streams): self.assertGreater(sum(record_count_by_stream_1.values()), sum(record_count_by_stream_2.values())) for stream in expected_streams: - with self.subTest(stream=stream): + with self.subTest(stream=stream, start_date_1=date_1, start_date_2=date_2): # expected values expected_primary_keys = self.expected_primary_keys()[stream] @@ 
-154,7 +168,7 @@ def run_test(self, date_1, date_2, streams): self.assertGreater(record_count_sync_2, 0) if expected_metadata.get(self.OBEYS_START_DATE): - + # Expected bookmark key is one element in set so directly access it bookmark_keys_list_1 = [message.get('data').get(next(iter(expected_bookmark_keys))) for message in synced_records_1.get(stream).get('messages') if message.get('action') == 'upsert'] @@ -195,7 +209,7 @@ def run_test(self, date_1, date_2, streams): self.assertTrue(primary_keys_sync_2.issubset(primary_keys_sync_1)) else: - + # Verify that the 2nd sync with a later start date replicates the same number of # records as the 1st sync. self.assertEqual(record_count_sync_2, record_count_sync_1) From 7013274ec27b02c67300544e585fe6578715569b Mon Sep 17 00:00:00 2001 From: Sourabh Gandhi <105213416+sgandhi1311@users.noreply.github.com> Date: Mon, 15 May 2023 17:15:02 +0530 Subject: [PATCH 08/30] TDL-22921 Fix the api limit error (#190) * to avoid api rate limit error, tap will sleep for the seconds mentioned in header - X-RateLimit-Remaining * recursively call the function(afterwards) if the tap is paused for sometime. 
* fix the existing unit tests * fixed pylint issue * update comments * setup and changelog --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_github/client.py | 27 +++++++++++++-------------- tests/unittests/test_rate_limit.py | 27 +++++---------------------- 4 files changed, 22 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f9cd9ed..040bfa3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 2.0.2 + * Make the tap sleep for `X-RateLimit-Reset` + `2` seconds, whenever the API rate limit is hit [#187](https://github.com/singer-io/tap-github/pull/187) + # 2.0.1 * Allow `commits` stream sync to continue when we hit an empty repo [#187](https://github.com/singer-io/tap-github/pull/187) diff --git a/setup.py b/setup.py index de812020..6d9ba88d 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.1', + version='2.0.2', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index c7f2a217..fe6b3de4 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -6,7 +6,6 @@ from singer import metrics LOGGER = singer.get_logger() -DEFAULT_SLEEP_SECONDS = 600 DEFAULT_DOMAIN = "https://api.github.com" # Set default timeout of 300 seconds @@ -136,24 +135,23 @@ def calculate_seconds(epoch): current = time.time() return int(round((epoch - current), 0)) -def rate_throttling(response, max_sleep_seconds): +def rate_throttling(response): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. 
""" if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) == 0: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) - - if seconds_to_sleep > max_sleep_seconds: - message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) - raise RateLimitExceeded(message) from None - LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) - time.sleep(seconds_to_sleep) - else: - # Raise an exception if `X-RateLimit-Remaining` is not found in the header. - # API does include this key header if provided base URL is not a valid github custom domain. - raise GithubException("The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") + # add the buffer 2 seconds + time.sleep(seconds_to_sleep + 2) + #returns True if tap sleeps + return True + return False + + # Raise an exception if `X-RateLimit-Remaining` is not found in the header. + # API does include this key header if provided base URL is not a valid github custom domain. + raise GithubException("The API call using the specified base url was unsuccessful. 
Please double-check the provided base URL.") class GithubClient: """ @@ -163,7 +161,6 @@ def __init__(self, config): self.config = config self.session = requests.Session() self.base_url = config['base_url'] if config.get('base_url') else DEFAULT_DOMAIN - self.max_sleep_seconds = self.config.get('max_sleep_seconds', DEFAULT_SLEEP_SECONDS) self.set_auth_in_session() self.not_accessible_repos = set() @@ -199,10 +196,12 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) with metrics.http_request_timer(source) as timer: self.session.headers.update(headers) resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) + if rate_throttling(resp): + # If the API rate limit is reached, the function will be recursively + self.authed_get(source, url, headers, stream, should_skip_404) if resp.status_code != 200: raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code - rate_throttling(resp, self.max_sleep_seconds) if resp.status_code in {404, 409}: # Return an empty response body since we're not raising a NotFoundException diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 987c60a0..a10525d8 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -19,7 +19,7 @@ class TestRateLimit(unittest.TestCase): def test_rate_limt_wait(self, mocked_sleep): """ - Test `rate_throttling` for 'sleep_time' less than `MAX_SLEEP_SECONDS` + Test `rate_throttling` for 'sleep_time' """ mocked_sleep.side_effect = None @@ -28,30 +28,13 @@ def test_rate_limt_wait(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 120 resp.headers["X-RateLimit-Remaining"] = 0 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp) # Verify `time.sleep` is called with expected seconds in response - mocked_sleep.assert_called_with(120) + mocked_sleep.assert_called_with(122) 
self.assertTrue(mocked_sleep.called) - def test_rate_limit_exception(self, mocked_sleep): - """ - Test `rate_throttling` for 'sleep_time' greater than `MAX_SLEEP_SECONDS` - """ - - mocked_sleep.side_effect = None - - resp = api_call() - resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 601 - resp.headers["X-RateLimit-Remaining"] = 0 - - # Verify exception is raised with proper message - with self.assertRaises(tap_github.client.RateLimitExceeded) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) - self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 601 seconds.") - - def test_rate_limit_not_exceeded(self, mocked_sleep): """ Test `rate_throttling` if sleep time does not exceed limit @@ -63,7 +46,7 @@ def test_rate_limit_not_exceeded(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 10 resp.headers["X-RateLimit-Remaining"] = 5 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp) # Verify that `time.sleep` is not called self.assertFalse(mocked_sleep.called) @@ -76,7 +59,7 @@ def test_rate_limt_header_not_found(self, mocked_sleep): resp.headers={} with self.assertRaises(GithubException) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp) # Verifying the message formed for the invalid base URL self.assertEqual(str(e.exception), "The API call using the specified base url was unsuccessful. 
Please double-check the provided base URL.") From 371e962a4b0eec7be6eb7d92c48cc87c245a6277 Mon Sep 17 00:00:00 2001 From: Sourabh Gandhi <105213416+sgandhi1311@users.noreply.github.com> Date: Tue, 16 May 2023 00:03:49 +0530 Subject: [PATCH 09/30] handle the secondary rate limit (#191) --- CHANGELOG.md | 5 ++++- setup.py | 2 +- tap_github/client.py | 7 +++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 040bfa3d..4182f874 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,10 @@ # Changelog +# 2.0.3 + * Handles the secondary rate limit - `Retry-After` [#191](https://github.com/singer-io/tap-github/pull/191) + # 2.0.2 - * Make the tap sleep for `X-RateLimit-Reset` + `2` seconds, whenever the API rate limit is hit [#187](https://github.com/singer-io/tap-github/pull/187) + * Make the tap sleep for `X-RateLimit-Reset` + `2` seconds, whenever the API rate limit is hit [#190](https://github.com/singer-io/tap-github/pull/190) # 2.0.1 * Allow `commits` stream sync to continue when we hit an empty repo [#187](https://github.com/singer-io/tap-github/pull/187) diff --git a/setup.py b/setup.py index 6d9ba88d..151dbf9c 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.2', + version='2.0.3', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index fe6b3de4..c7d7f880 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -139,6 +139,13 @@ def rate_throttling(response): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. """ + if "Retry-After" in response.headers: + # handles the secondary rate limit + seconds_to_sleep = int(response.headers['Retry-After']) + LOGGER.info("API rate limit exceeded. 
Tap will retry the data collection after %s seconds.", seconds_to_sleep) + time.sleep(seconds_to_sleep) + #returns True if tap sleeps + return True if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) == 0: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) From 587949b57ff6a0f934b956b8543386e87892eb10 Mon Sep 17 00:00:00 2001 From: Sourabh Gandhi <105213416+sgandhi1311@users.noreply.github.com> Date: Tue, 16 May 2023 15:55:27 +0530 Subject: [PATCH 10/30] Recursively call the function if `Retry-After` has the value greater than 0 (#192) --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_github/client.py | 9 +++++---- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4182f874..760a29af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 2.0.4 + * Recursively call the function if `Retry-After` has the value greater than 0 [#192](https://github.com/singer-io/tap-github/pull/192) + # 2.0.3 * Handles the secondary rate limit - `Retry-After` [#191](https://github.com/singer-io/tap-github/pull/191) diff --git a/setup.py b/setup.py index 151dbf9c..e19b70d8 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.3', + version='2.0.4', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index c7d7f880..9e846282 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -142,10 +142,11 @@ def rate_throttling(response): if "Retry-After" in response.headers: # handles the secondary rate limit seconds_to_sleep = int(response.headers['Retry-After']) - LOGGER.info("API rate limit exceeded. 
Tap will retry the data collection after %s seconds.", seconds_to_sleep) - time.sleep(seconds_to_sleep) - #returns True if tap sleeps - return True + if seconds_to_sleep > 0: + LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) + time.sleep(seconds_to_sleep) + #returns True if tap sleeps + return True if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) == 0: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) From 3e4565121670261b8b809611ed24e666399d105c Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 29 Jun 2023 09:34:59 -0300 Subject: [PATCH 11/30] Fix incorrect format of discussion_url field in releases schema (#196) --- tap_github/schemas/releases.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tap_github/schemas/releases.json b/tap_github/schemas/releases.json index b903a026..fc97ced2 100644 --- a/tap_github/schemas/releases.json +++ b/tap_github/schemas/releases.json @@ -186,8 +186,7 @@ "format": "date-time" }, "discussion_url": { - "type": ["null", "string"], - "format": "date-time" + "type": ["null", "string"] } } } \ No newline at end of file From daf14c29bd22fb8ebdebe272ad297080063408e3 Mon Sep 17 00:00:00 2001 From: Sourabh Gandhi <105213416+sgandhi1311@users.noreply.github.com> Date: Thu, 29 Jun 2023 21:39:34 +0530 Subject: [PATCH 12/30] setup and changelog modified (#197) --- CHANGELOG.md | 3 +++ setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 760a29af..ba99194d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 2.0.5 + * Remove date-time format from the field discussion_url in releases schema [#196](https://github.com/singer-io/tap-github/pull/196) + # 2.0.4 * Recursively call the function if `Retry-After` has the value greater than 0 
[#192](https://github.com/singer-io/tap-github/pull/192) diff --git a/setup.py b/setup.py index e19b70d8..79890701 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.4', + version='2.0.5', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From dc309b65e66e5a88355c546e81540280d7c7b4ab Mon Sep 17 00:00:00 2001 From: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> Date: Tue, 12 Sep 2023 13:36:56 -0400 Subject: [PATCH 13/30] TDL-239782 Remove `files` and `stats` fields from commit-related schemas (#198) * Remove files and stats fields from commit-related schemas / tests * Changelog / version bump --- CHANGELOG.md | 5 +++ setup.py | 2 +- tap_github/schemas/commits.json | 46 ------------------------- tap_github/schemas/pr_commits.json | 54 +----------------------------- tests/test_github_all_fields.py | 6 ---- 5 files changed, 7 insertions(+), 106 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba99194d..e3b3e962 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +# 2.0.6 + * Remove `files` and `stats` fields from `commits` endpoint as they are not returned without fetching individual commmits [#198](https://github.com/singer-io/tap-github/pull/198) + * Remove `files` and `stats` fields from `pr-commits` endpoint as they are not documented and not returned + * Update tests accordingly + # 2.0.5 * Remove date-time format from the field discussion_url in releases schema [#196](https://github.com/singer-io/tap-github/pull/196) diff --git a/setup.py b/setup.py index 79890701..2271e6ea 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.5', + version='2.0.6', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git 
a/tap_github/schemas/commits.json b/tap_github/schemas/commits.json index cf873448..00f0d2c8 100644 --- a/tap_github/schemas/commits.json +++ b/tap_github/schemas/commits.json @@ -44,38 +44,6 @@ } } }, - "files": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "filename": { - "type": ["null", "string"] - }, - "additions": { - "type": ["null", "number"] - }, - "deletions": { - "type": ["null", "number"] - }, - "changes": { - "type": ["null", "number"] - }, - "status": { - "type": ["null", "string"] - }, - "raw_url": { - "type": ["null", "string"] - }, - "blob_url": { - "type": ["null", "string"] - }, - "patch": { - "type": ["null", "string"] - } - } - } - }, "html_url": { "type": ["null", "string"] }, @@ -258,20 +226,6 @@ }, "author": { "$ref": "shared/user.json#/" - }, - "stats": { - "type": ["null", "object"], - "properties": { - "additions": { - "type": ["null", "integer"] - }, - "deletions": { - "type": ["null", "integer"] - }, - "total": { - "type": ["null", "integer"] - } - } } }, "additionalProperties": false diff --git a/tap_github/schemas/pr_commits.json b/tap_github/schemas/pr_commits.json index f4fa2f82..1108cfb3 100644 --- a/tap_github/schemas/pr_commits.json +++ b/tap_github/schemas/pr_commits.json @@ -35,44 +35,6 @@ } } }, - "files": { - "type": ["null","array"], - "items": { - "type": ["null","object"], - "properties": { - "filename": { - "type": ["null","string"] - }, - "additions": { - "type": ["null","number"] - }, - "deletions": { - "type": ["null","number"] - }, - "changes": { - "type": ["null","number"] - }, - "status": { - "type": ["null","string"] - }, - "raw_url": { - "type": ["null","string"] - }, - "blob_url": { - "type": ["null","string"] - }, - "contents_url": { - "type": ["null","string"] - }, - "sha": { - "type": ["null","string"] - }, - "patch": { - "type": ["null","string"] - } - } - } - }, "html_url": { "type": ["null", "string"], "description": "The HTML URL to the commit" @@ -301,23 
+263,9 @@ "author": { "$ref": "shared/user.json#/" }, - "stats": { - "type": ["null", "object"], - "properties": { - "additions": { - "type": ["null", "integer"] - }, - "deletions": { - "type": ["null", "integer"] - }, - "total": { - "type": ["null", "integer"] - } - } - }, "updated_at": { "type": ["null", "string"], "format": "date-time" } } -} \ No newline at end of file +} diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py index 305a9151..1fbb722e 100644 --- a/tests/test_github_all_fields.py +++ b/tests/test_github_all_fields.py @@ -20,15 +20,9 @@ 'project_id' }, 'commits': { - 'files', 'pr_id', 'id', 'pr_number', - 'stats', - }, - 'pr_commits': { - 'files', - 'stats' }, 'review_comments': { 'assignees', From 90abb48b134eb051b61d2c7b8768b690b35b000f Mon Sep 17 00:00:00 2001 From: bryantgray Date: Thu, 26 Oct 2023 10:15:45 -0400 Subject: [PATCH 14/30] TDL-24380 - Don't rely on dictionary order to grab PKs (#199) * Don't rely on dictionary order to grab PKs * Get PK from metadata map * version bump and changelog update Co-authored-by: dsprayberry --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_github/discover.py | 6 ++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3b3e962..d31fc8f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 3.0.0 + * Allow all python versions to grab the correct key_properties/PK value [#199](https://github.com/singer-io/tap-github/pull/199) + # 2.0.6 * Remove `files` and `stats` fields from `commits` endpoint as they are not returned without fetching individual commmits [#198](https://github.com/singer-io/tap-github/pull/198) * Remove `files` and `stats` fields from `pr-commits` endpoint as they are not documented and not returned diff --git a/setup.py b/setup.py index 2271e6ea..1e470ab4 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.6', + 
version='3.0.0', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/discover.py b/tap_github/discover.py index 386857ee..b39449e5 100644 --- a/tap_github/discover.py +++ b/tap_github/discover.py @@ -1,4 +1,5 @@ import singer +from singer import metadata from singer.catalog import Catalog, CatalogEntry, Schema from tap_github.schema import get_schemas @@ -24,11 +25,12 @@ def discover(client): LOGGER.error('type schema_dict: %s', type(schema_dict)) raise err - key_properties = mdata[0]['metadata'].get('table-key-properties') + key_properties = metadata.to_map(mdata).get((), {}).get('table-key-properties') + catalog.streams.append(CatalogEntry( stream=stream_name, tap_stream_id=stream_name, - key_properties= key_properties, + key_properties=key_properties, schema=schema, metadata=mdata )) From b4a06d89a21be27de898d57dc54fbdb277dd67ba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 26 Oct 2023 10:45:38 -0400 Subject: [PATCH 15/30] Bump requests from 2.20.0 to 2.31.0 (#193) * Bump requests from 2.20.0 to 2.31.0 Bumps [requests](https://github.com/psf/requests) from 2.20.0 to 2.31.0. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.20.0...v2.31.0) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Bump version, update changelog --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Andy Lu Co-authored-by: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> --- CHANGELOG.md | 3 ++- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d31fc8f8..44bf6504 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ # 3.0.0 * Allow all python versions to grab the correct key_properties/PK value [#199](https://github.com/singer-io/tap-github/pull/199) + * Dependabot update [#193](https://github.com/singer-io/tap-github/pull/193) # 2.0.6 * Remove `files` and `stats` fields from `commits` endpoint as they are not returned without fetching individual commmits [#198](https://github.com/singer-io/tap-github/pull/198) @@ -180,4 +181,4 @@ * [#9](https://github.com/singer-io/tap-github/pull/9) ## 0.3.0 - * Adds support for retrieving pull requests, assignees and collaborars [#8](https://github.com/singer-io/tap-github/pull/8) + * Adds support for retrieving pull requests, assignees and collaborars [#8](https://github.com/singer-io/tap-github/pull/8) \ No newline at end of file diff --git a/setup.py b/setup.py index 1e470ab4..e0c28ef3 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ py_modules=['tap_github'], install_requires=[ 'singer-python==5.12.1', - 'requests==2.20.0', + 'requests==2.31.0', 'backoff==1.8.0' ], extras_require={ @@ -31,4 +31,4 @@ 'tap_github': ['tap_github/schemas/*.json'] }, include_package_data=True -) +) \ No newline at end of file From 090dd660474ceeedbe4410f02b956737129af784 Mon Sep 17 00:00:00 2001 From: rdeshmukh15 <107538720+rdeshmukh15@users.noreply.github.com> Date: Mon, 11 Mar 2024 19:41:58 +0530 Subject: [PATCH 16/30] changes : (#205) Remove uri format for the field url --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_github/schemas/events.json | 3 
+-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44bf6504..270a7dcc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 3.0.1 + * Remove URI format of `/payload/issue/labels/url` field from `events` stream [#205](https://github.com/singer-io/tap-github/pull/205) + # 3.0.0 * Allow all python versions to grab the correct key_properties/PK value [#199](https://github.com/singer-io/tap-github/pull/199) * Dependabot update [#193](https://github.com/singer-io/tap-github/pull/193) diff --git a/setup.py b/setup.py index e0c28ef3..7cc2f1bd 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.0.0', + version='3.0.1', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/schemas/events.json b/tap_github/schemas/events.json index 266ef2c8..02b3f1a0 100644 --- a/tap_github/schemas/events.json +++ b/tap_github/schemas/events.json @@ -173,8 +173,7 @@ "type": ["null", "string"] }, "url": { - "type": ["null", "string"], - "format": "uri" + "type": ["null", "string"] }, "name": { "type": ["null", "string"] From 5767730cf8c01be56e9281e03aca9783e62a8711 Mon Sep 17 00:00:00 2001 From: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:10:09 -0400 Subject: [PATCH 17/30] The format is breaking more thorough schema validation (#208) --- tap_github/schemas/events.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tap_github/schemas/events.json b/tap_github/schemas/events.json index 02b3f1a0..35255cb2 100644 --- a/tap_github/schemas/events.json +++ b/tap_github/schemas/events.json @@ -166,8 +166,7 @@ "type": ["null", "object"], "properties": { "id": { - "type": ["null", "integer"], - "format": "int64" + "type": ["null", "integer"] }, "node_id": { "type": ["null", "string"] @@ -1051,4 +1050,4 @@ 
"type": ["null", "string"] } } -} \ No newline at end of file +} From 6a683c41aa7c6a9cc98ebd3c6cedb176090b55ea Mon Sep 17 00:00:00 2001 From: Eivin Giske Skaaren Date: Tue, 3 Sep 2024 12:25:15 +0200 Subject: [PATCH 18/30] Enable copilot usage in PR template according to Qlik policy --- .github/pull_request_template.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 6e46b008..ef49bc0e 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -9,3 +9,7 @@ # Rollback steps - revert this branch + +#### AI generated code +https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code +- [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool From 6aa5d3e14309d5d10b019408f42cb8881c69980d Mon Sep 17 00:00:00 2001 From: Leslie VanDeMark <38043390+leslievandemark@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:44:14 -0500 Subject: [PATCH 19/30] Tdl 26656 transform state (#212) * translate state to stream name then repo * update streams.py to use new state format * update unittests * test fixes so far * fix bug in bookmarks test * test updates * fix start date * fix event data generation for pagination test, update doc strings in sync * comment cleanup * update comment * version bump and changelog update * version bump * remove reference to qcdi * Add tests to translate multiple streams * fix test to not access full table stream key in bookmark --------- Co-authored-by: Scott Nakano Co-authored-by: Andy Lu --- CHANGELOG.md | 5 +- setup.py | 2 +- tap_github/streams.py | 20 ++--- tap_github/sync.py | 69 ++++++++-------- tests/test_github_all_fields.py | 1 - tests/test_github_bookmarks.py | 36 +++++---- tests/test_github_interrupted_sync.py | 63 +++++++-------- ...test_github_interrupted_sync_add_stream.py | 54 ++++++------- ...t_github_interrupted_sync_remove_stream.py | 66 ++++++++-------- 
tests/test_github_pagination.py | 1 + tests/test_github_start_date.py | 26 +++--- .../test_get_streams_and_state_translate.py | 79 ++++++++++++++----- tests/unittests/test_stream.py | 62 +++++++-------- tests/unittests/test_sync_endpoint.py | 6 +- 14 files changed, 274 insertions(+), 216 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 270a7dcc..db766a9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 3.1.0 + * Transform state to allow stream level resets [#212](https://github.com/singer-io/tap-github/pull/212) + # 3.0.1 * Remove URI format of `/payload/issue/labels/url` field from `events` stream [#205](https://github.com/singer-io/tap-github/pull/205) @@ -184,4 +187,4 @@ * [#9](https://github.com/singer-io/tap-github/pull/9) ## 0.3.0 - * Adds support for retrieving pull requests, assignees and collaborars [#8](https://github.com/singer-io/tap-github/pull/8) \ No newline at end of file + * Adds support for retrieving pull requests, assignees and collaborars [#8](https://github.com/singer-io/tap-github/pull/8) diff --git a/setup.py b/setup.py index 7cc2f1bd..e175c736 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.0.1', + version='3.1.0', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/streams.py b/tap_github/streams.py index 278dd05a..4d402ff3 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -9,7 +9,7 @@ def get_bookmark(state, repo, stream_name, bookmark_key, start_date): """ Return bookmark value if available in the state otherwise return start date """ - repo_stream_dict = bookmarks.get_bookmark(state, repo, stream_name) + repo_stream_dict = bookmarks.get_bookmark(state, stream_name, repo) if repo_stream_dict: return repo_stream_dict.get(bookmark_key) @@ -119,7 +119,7 @@ def write_bookmarks(self, stream, selected_streams, 
bookmark_value, repo_path, s # If the stream is selected, write the bookmark. if stream in selected_streams: - singer.write_bookmark(state, repo_path, stream_obj.tap_stream_id, {"since": bookmark_value}) + singer.write_bookmark(state, stream_obj.tap_stream_id, repo_path, {"since": bookmark_value}) # For the each child, write the bookmark if it is selected. for child in stream_obj.children: @@ -205,14 +205,14 @@ def add_fields_at_1st_level(self, record, parent_record = None): class FullTableStream(Stream): def sync_endpoint(self, - client, - state, - catalog, - repo_path, - start_date, - selected_stream_ids, - stream_to_sync - ): + client, + state, + catalog, + repo_path, + start_date, + selected_stream_ids, + stream_to_sync + ): """ A common function sync full table streams. """ diff --git a/tap_github/sync.py b/tap_github/sync.py index a83610ad..961f17f6 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -65,18 +65,9 @@ def get_ordered_repos(state, repositories): def translate_state(state, catalog, repositories): ''' - This tap used to only support a single repository, in which case the - the state took the shape of: - { - "bookmarks": { - "commits": { - "since": "2018-11-14T13:21:20.700360Z" - } - } - } - The tap now supports multiple repos, so this function should be called - at the beginning of each run to ensure the state is translated to the - new format: + The tap supports multiple repositories. Previously, the state format + for bookmarks included stream keys nested under each repository, as + shown below: { "bookmarks": { "singer-io/tap-adwords": { @@ -91,41 +82,55 @@ def translate_state(state, catalog, repositories): } } } + + The stream keys must be the second key after bookmarks in order for + standardized table-level resets to function correctly. 
This function + should be called at the start of each run to ensure that the state + is properly converted to the new format: + { + "bookmarks": { + "commits" : { + "singer-io/tap-adwords": { + "since": "2018-11-14T13:21:20.700360Z" + }, + "singer-io/tap-salesforce": { + "since": "2018-11-14T13:21:20.700360Z" + } + }, + "issues" : { + "singer-io/tap-adwords": { + "since": "2018-11-14T13:21:20.700360Z" + }, + "singer-io/tap-salesforce": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + } + ''' nested_dict = lambda: collections.defaultdict(nested_dict) new_state = nested_dict() - # Collect keys(repo_name for update state or stream_name for older state) from state available in the `bookmarks`` + # Collect keys(stream_name for update state or repo_name for older state) from state available in the `bookmarks`` previous_state_keys = state.get('bookmarks', {}).keys() # Collect stream names from the catalog stream_names = [stream['tap_stream_id'] for stream in catalog['streams']] for key in previous_state_keys: # Loop through each key of `bookmarks` available in the previous state. - - # Case 1: - # Older connections `bookmarks` contain stream names so check if it is the stream name or not. - # If the previous state's key is found in the stream name list then continue to check other keys. Because we want - # to migrate each stream's bookmark into the repo name as mentioned below: - # Example: {`bookmarks`: {`stream_a`: `bookmark_a`}} to {`bookmarks`: {`repo_a`: {`stream_a`: `bookmark_a`}}} - - # Case 2: - # Check if the key is available in the list of currently selected repo's list or not. Newer format `bookmarks` contain repo names. - # Return the state if the previous state's key is not found in the repo name list or stream name list. - - # If the state contains a bookmark for `repo_a` and `repo_b` and the user deselects these both repos and adds another repo - # then in that case this function was returning an empty state. 
Now this change will return the existing state instead of the empty state. - if key not in stream_names and key not in repositories: - # Return the existing state if all repos from the previous state are deselected(not found) in the current sync. - return state + for inner_key in state['bookmarks'][key].keys(): + if inner_key not in stream_names and inner_key not in repositories: + # Return the existing state if all repos from the previous state are deselected(not found) in the current sync. + return state for stream in catalog['streams']: stream_name = stream['tap_stream_id'] for repo in repositories: - if bookmarks.get_bookmark(state, repo, stream_name): + if bookmarks.get_bookmark(state, stream_name, repo): return state - if bookmarks.get_bookmark(state, stream_name, 'since'): - new_state['bookmarks'][repo][stream_name]['since'] = bookmarks.get_bookmark(state, stream_name, 'since') + if bookmarks.get_bookmark(state, repo, stream_name): + new_state['bookmarks'][stream_name][repo] = bookmarks.get_bookmark(state, repo, stream_name) return new_state diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py index 1fbb722e..35324747 100644 --- a/tests/test_github_all_fields.py +++ b/tests/test_github_all_fields.py @@ -58,7 +58,6 @@ }, 'issues': { 'body_text', - 'closed_by', 'body_html' }, 'releases': { diff --git a/tests/test_github_bookmarks.py b/tests/test_github_bookmarks.py index d40372d2..e5cea25f 100644 --- a/tests/test_github_bookmarks.py +++ b/tests/test_github_bookmarks.py @@ -1,6 +1,7 @@ import datetime import dateutil.parser import pytz +import copy from tap_tester import runner, menagerie, connections @@ -27,21 +28,23 @@ def calculated_states_by_stream(self, current_state, synced_records, replication timedelta_by_stream["commits"] = [7, 0, 0] repo = self.get_properties().get('repository') + #stream_to_calculated_state = {repo: {stream: "" for stream in current_state['bookmarks'][repo].keys()}} + stream_to_calculated_state = 
copy.deepcopy(current_state)['bookmarks'] - stream_to_calculated_state = {repo: {stream: "" for stream in current_state['bookmarks'][repo].keys()}} - for stream, state in current_state['bookmarks'][repo].items(): - state_key, state_value = next(iter(state.keys())), next(iter(state.values())) - state_as_datetime = dateutil.parser.parse(state_value) + for stream in current_state['bookmarks'].keys(): + for repo, state in current_state['bookmarks'][stream].items(): + state_key, state_value = next(iter(state.keys())), next(iter(state.values())) + state_as_datetime = dateutil.parser.parse(state_value) - days, hours, minutes = timedelta_by_stream[stream] + days, hours, minutes = timedelta_by_stream[stream] - start_date_as_datetime = dateutil.parser.parse(start_date) - calculated_state_as_datetime = start_date_as_datetime + datetime.timedelta(days=days, hours=hours, minutes=minutes) + start_date_as_datetime = dateutil.parser.parse(start_date) + calculated_state_as_datetime = start_date_as_datetime + datetime.timedelta(days=days, hours=hours, minutes=minutes) - state_format = '%Y-%m-%dT%H:%M:%SZ' - calculated_state_formatted = datetime.datetime.strftime(calculated_state_as_datetime, state_format) + state_format = '%Y-%m-%dT%H:%M:%SZ' + calculated_state_formatted = datetime.datetime.strftime(calculated_state_as_datetime, state_format) - stream_to_calculated_state[repo][stream] = {state_key: calculated_state_formatted} + stream_to_calculated_state[stream][repo] = {state_key: calculated_state_formatted} return stream_to_calculated_state @@ -69,7 +72,6 @@ def test_run(self): ########################################################################## ### First Sync ########################################################################## - conn_id = connections.ensure_connection(self, original_properties=True) # Run in check mode @@ -94,8 +96,8 @@ def test_run(self): first_sync_records, expected_replication_keys, first_sync_start_date) - for repo, new_state in 
simulated_states.items(): - new_states['bookmarks'][repo] = new_state + for stream, new_state in simulated_states.items(): + new_states['bookmarks'][stream] = new_state menagerie.set_state(conn_id, new_states) ########################################################################## @@ -125,20 +127,21 @@ def test_run(self): second_sync_messages = [record.get('data') for record in second_sync_records.get(stream, {'messages': []}).get('messages') if record.get('action') == 'upsert'] - first_bookmark_key_value = first_sync_bookmarks.get('bookmarks', {}).get(repo, {stream: None}).get(stream) - second_bookmark_key_value = second_sync_bookmarks.get('bookmarks', {}).get(repo, {stream: None}).get(stream) + first_bookmark_key_value = first_sync_bookmarks.get('bookmarks', {}).get(stream, {repo: None}).get(repo) + second_bookmark_key_value = second_sync_bookmarks.get('bookmarks', {}).get(stream, {repo: None}).get(repo) if expected_replication_method == self.INCREMENTAL: # Collect information specific to incremental streams from syncs 1 & 2 replication_key = next(iter(expected_replication_keys[stream])) + first_bookmark_value = first_bookmark_key_value.get('since') second_bookmark_value = second_bookmark_key_value.get('since') first_bookmark_value_ts = self.dt_to_ts(first_bookmark_value, self.BOOKMARK_FORMAT) second_bookmark_value_ts = self.dt_to_ts(second_bookmark_value, self.BOOKMARK_FORMAT) - simulated_bookmark_value = self.dt_to_ts(new_states['bookmarks'][repo][stream]['since'], self.BOOKMARK_FORMAT) + simulated_bookmark_value = self.dt_to_ts(new_states['bookmarks'][stream][repo]['since'], self.BOOKMARK_FORMAT) # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_key_value) @@ -159,7 +162,6 @@ def test_run(self): for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = self.dt_to_ts(record.get(replication_key), replication_key_format) - 
self.assertLessEqual( replication_key_value, first_bookmark_value_ts, msg="First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." diff --git a/tests/test_github_interrupted_sync.py b/tests/test_github_interrupted_sync.py index 7c268604..9b31edb6 100644 --- a/tests/test_github_interrupted_sync.py +++ b/tests/test_github_interrupted_sync.py @@ -28,7 +28,7 @@ def test_run(self): expected_replication_methods = self.expected_replication_method() expected_replication_keys = self.expected_bookmark_keys() repo_key = "_sdc_repository" - + start_date = self.dt_to_ts(self.get_properties().get("start_date"), self.BOOKMARK_FORMAT) # Run a discovery job @@ -53,24 +53,26 @@ def test_run(self): "currently_syncing": "pull_requests", "currently_syncing_repo": "singer-io/test-repo", "bookmarks": { - "singer-io/singer-python": { - "issues": { + "issues": { + "singer-io/singer-python": { "since": "2022-06-22T13:32:42Z" }, - "pull_requests": { - "since": "2022-06-22T13:32:42Z" + "singer-io/test-repo": { + "since": "2022-07-13T09:21:19Z" }, - "issue_events": { - "since": "2022-06-22T13:32:42Z" - } }, - "singer-io/test-repo": { - "issues": { - "since": "2022-07-13T09:21:19Z" + "pull_requests": { + "singer-io/singer-python": { + "since": "2022-06-22T13:32:42Z" }, - "pull_requests": { + "singer-io/test-repo": { "since": "2022-06-30T05:33:24Z" - } + }, + }, + "issue_events": { + "singer-io/singer-python": { + "since": "2022-06-22T13:32:42Z" + }, } } } @@ -98,23 +100,18 @@ def test_run(self): # (This is what the value would have been without an interruption and proves resuming succeeds) self.assertDictEqual(final_state, full_sync_state) - for repository in self.get_properties().get("repository").split(): - with self.subTest(repository=repository): - - full_sync_bookmark = full_sync_state["bookmarks"][repository] - final_bookmark = final_state["bookmarks"][repository] - interrupted_repo_bookmark = interrupted_state["bookmarks"][repository] - - for 
stream in streams_to_test: - with self.subTest(stream=stream): - + for stream in streams_to_test: + with self.subTest(stream=stream): + for repository in self.get_properties().get("repository").split(): + with self.subTest(repository=repository): + # Expected values expected_replication_method = expected_replication_methods[stream] expected_primary_keys = list(self.expected_primary_keys()[stream]) # Gather results full_records = [message['data'] for message in - full_sync_records.get(stream, {}).get('messages', []) + full_sync_records.get(stream, {}).get('messages', []) if message['data'][repo_key] == repository] full_record_count = len(full_records) @@ -124,11 +121,15 @@ def test_run(self): interrupted_record_count = len(interrupted_records) if expected_replication_method == self.INCREMENTAL: + full_sync_bookmark = full_sync_state["bookmarks"][stream] + final_bookmark = final_state["bookmarks"][stream] + interrupted_repo_bookmark = interrupted_state["bookmarks"][stream] + expected_replication_key = next(iter(expected_replication_keys[stream])) - - if stream in interrupted_repo_bookmark.keys(): - interrupted_bookmark = self.dt_to_ts(interrupted_repo_bookmark[stream]["since"], self.BOOKMARK_FORMAT) - + + if repository in interrupted_repo_bookmark.keys(): + interrupted_bookmark = self.dt_to_ts(interrupted_repo_bookmark[repository]["since"], self.BOOKMARK_FORMAT) + if stream == interrupted_state['currently_syncing'] and repository == interrupted_state['currently_syncing_repo']: for record in interrupted_records: @@ -147,7 +148,7 @@ def test_run(self): if (rec_time >= interrupted_bookmark): full_records_after_interrupted_bookmark += 1 - + self.assertEqual(full_records_after_interrupted_bookmark, len(interrupted_records), \ msg="Expected {} records in each sync".format(full_records_after_interrupted_bookmark)) else: @@ -163,8 +164,8 @@ def test_run(self): self.assertIn(record, interrupted_records, msg='Record missing from resuming sync.' 
) else: # Verify full table streams do not save bookmarked values at the conclusion of a successful sync - self.assertNotIn(stream, full_sync_bookmark.keys()) - self.assertNotIn(stream, final_bookmark.keys()) + self.assertNotIn(stream, full_sync_state["bookmarks"].keys()) + self.assertNotIn(stream, final_state["bookmarks"].keys()) # Verify first and second sync have the same records self.assertEqual(full_record_count, interrupted_record_count) diff --git a/tests/test_github_interrupted_sync_add_stream.py b/tests/test_github_interrupted_sync_add_stream.py index 0b46d389..3142b092 100644 --- a/tests/test_github_interrupted_sync_add_stream.py +++ b/tests/test_github_interrupted_sync_add_stream.py @@ -62,21 +62,21 @@ def test_run(self): "currently_syncing": "pull_requests", "currently_syncing_repo": "singer-io/test-repo", "bookmarks": { - "singer-io/singer-python": { - "issues": { + "issues": { + "singer-io/singer-python": { "since": "2022-06-22T13:32:42Z" }, - "pull_requests": { - "since": "2022-06-22T13:32:42Z" - } + "singer-io/test-repo": { + "since": "2022-07-14T07:47:21Z" + }, }, - "singer-io/test-repo": { - "issues": { + "pull_requests": { + "singer-io/singer-python": { + "since": "2022-06-22T13:32:42Z" + }, + "singer-io/test-repo": { "since": "2022-07-14T07:47:21Z" }, - "pull_requests": { - "since": "2022-07-13T07:47:21Z" - } } } } @@ -100,16 +100,10 @@ def test_run(self): # Verify bookmarks are saved self.assertIsNotNone(final_state.get('bookmarks')) - for repository in self.get_properties().get("repository").split(): - with self.subTest(repository=repository): - - full_sync_bookmark = full_sync_state["bookmarks"][repository] - final_bookmark = final_state["bookmarks"][repository] - interrupted_repo_bookmark = interrupted_state["bookmarks"][repository] - - for stream in streams_to_test: - with self.subTest(stream=stream): - + for stream in streams_to_test: + with self.subTest(stream=stream): + for repository in self.get_properties().get("repository").split(): + 
with self.subTest(repository=repository): # Expected values expected_replication_method = expected_replication_methods[stream] @@ -126,14 +120,18 @@ def test_run(self): interrupted_record_count = len(interrupted_records) if expected_replication_method == self.INCREMENTAL: + final_bookmark = final_state["bookmarks"][stream] + expected_replication_key = next(iter(expected_replication_keys[stream])) - if stream in full_sync_bookmark.keys(): - full_sync_stream_bookmark = self.dt_to_ts(full_sync_bookmark.get(stream, {}).get("since"), self.BOOKMARK_FORMAT) - final_sync_stream_bookmark = self.dt_to_ts(final_bookmark.get(stream, {}).get("since"), self.BOOKMARK_FORMAT) + if stream in full_sync_state["bookmarks"].keys(): + full_sync_bookmark = full_sync_state["bookmarks"][stream] + full_sync_stream_bookmark = self.dt_to_ts(full_sync_bookmark.get(repository, {}).get("since"), self.BOOKMARK_FORMAT) + final_sync_stream_bookmark = self.dt_to_ts(final_bookmark.get(repository, {}).get("since"), self.BOOKMARK_FORMAT) - if stream in interrupted_repo_bookmark.keys(): - interrupted_bookmark = self.dt_to_ts(interrupted_repo_bookmark[stream]["since"], self.BOOKMARK_FORMAT) + if stream in interrupted_state["bookmarks"].keys(): + interrupted_repo_bookmark = interrupted_state["bookmarks"][stream] + interrupted_bookmark = self.dt_to_ts(interrupted_repo_bookmark[repository]["since"], self.BOOKMARK_FORMAT) for record in interrupted_records: rec_time = self.dt_to_ts(record[expected_replication_key], self.RECORD_REPLICATION_KEY_FORMAT) @@ -144,7 +142,7 @@ def test_run(self): self.assertGreater(interrupted_record_count, 0) if stream != added_stream: - + # Verify state ends with the same value for common streams after both full and interrupted syncs self.assertEqual(full_sync_stream_bookmark, final_sync_stream_bookmark) @@ -168,8 +166,8 @@ def test_run(self): else: # Verify full table streams do not save bookmarked values after a successful sync - self.assertNotIn(stream, 
full_sync_bookmark.keys()) - self.assertNotIn(stream, final_bookmark.keys()) + self.assertNotIn(stream, full_sync_state["bookmarks"].keys()) + self.assertNotIn(stream, final_state["bookmarks"].keys()) # Verify first and second sync have the same records self.assertEqual(full_record_count, interrupted_record_count) diff --git a/tests/test_github_interrupted_sync_remove_stream.py b/tests/test_github_interrupted_sync_remove_stream.py index 04ed54d6..efdea355 100644 --- a/tests/test_github_interrupted_sync_remove_stream.py +++ b/tests/test_github_interrupted_sync_remove_stream.py @@ -37,7 +37,7 @@ def run_interrupted_sync(self, removed_stream): expected_replication_methods = self.expected_replication_method() expected_replication_keys = self.expected_bookmark_keys() repo_key = "_sdc_repository" - + start_date = self.dt_to_ts(self.get_properties().get("start_date"), self.BOOKMARK_FORMAT) # Run a discovery job @@ -54,14 +54,14 @@ def run_interrupted_sync(self, removed_stream): # Acquire records from target output full_sync_records = runner.get_records_from_target_output() full_sync_state = menagerie.get_state(conn_id) - + # Create new connection for another sync conn_id_2 = connections.ensure_connection(self) # Add a stream between syncs streams_to_test = streams_to_test - {removed_stream} found_catalogs = self.run_and_verify_check_mode(conn_id_2) - + test_catalogs = [catalog for catalog in found_catalogs if catalog.get('stream_name') in streams_to_test] @@ -75,24 +75,26 @@ def run_interrupted_sync(self, removed_stream): "currently_syncing": "pull_requests", "currently_syncing_repo": "singer-io/test-repo", "bookmarks": { - "singer-io/singer-python": { - "issues": { - "since": "2022-06-22T13:32:42Z" - }, - "pull_requests": { + "issues": { + "singer-io/singer-python": { "since": "2022-06-22T13:32:42Z" }, - "issue_events": { - "since": "2022-06-22T13:32:42Z" + "singer-io/test-repo": { + "since": "2022-07-14T07:47:21Z" } }, - "singer-io/test-repo": { - "issues": { - "since": 
"2022-07-14T07:47:21Z" + "pull_requests": { + "singer-io/singer-python": { + "since": "2022-06-22T13:32:42Z" }, - "pull_requests": { + "singer-io/test-repo": { "since": "2022-07-13T07:47:21Z" } + }, + "issue_events": { + "singer-io/singer-python": { + "since": "2022-06-22T13:32:42Z" + } } } } @@ -116,23 +118,19 @@ def run_interrupted_sync(self, removed_stream): # Verify bookmarks are saved self.assertIsNotNone(final_state.get('bookmarks')) - for repository in self.get_properties().get("repository").split(): - with self.subTest(repository=repository): - - full_sync_bookmark = full_sync_state["bookmarks"][repository] - final_bookmark = final_state["bookmarks"][repository] - interrupted_repo_bookmark = interrupted_state["bookmarks"][repository] - - for stream in list(streams_to_test) + [removed_stream]: - with self.subTest(stream=stream): - + for stream in list(streams_to_test) + [removed_stream]: + with self.subTest(stream=stream): + + for repository in self.get_properties().get("repository").split(): + with self.subTest(repository=repository): + # Expected values expected_replication_method = expected_replication_methods[stream] expected_primary_keys = list(self.expected_primary_keys()[stream]) # Gather results full_records = [message['data'] for message in - full_sync_records.get(stream, {}).get('messages', []) + full_sync_records.get(stream, {}).get('messages', []) if message['data'][repo_key] == repository] full_record_count = len(full_records) @@ -145,12 +143,16 @@ def run_interrupted_sync(self, removed_stream): self.assertNotIn(stream, interrupted_sync_records.keys()) if expected_replication_method == self.INCREMENTAL: + full_sync_bookmark = full_sync_state["bookmarks"][stream] + expected_replication_key = next(iter(expected_replication_keys[stream])) - full_sync_stream_bookmark = self.dt_to_ts(full_sync_bookmark.get(stream, {}).get("since"), self.BOOKMARK_FORMAT) - - if stream in interrupted_repo_bookmark.keys(): - interrupted_bookmark = 
self.dt_to_ts(interrupted_repo_bookmark[stream]["since"], self.BOOKMARK_FORMAT) - final_sync_stream_bookmark = self.dt_to_ts(final_bookmark.get(stream, {}).get("since"), self.BOOKMARK_FORMAT) + full_sync_stream_bookmark = self.dt_to_ts(full_sync_bookmark.get(repository, {}).get("since"), self.BOOKMARK_FORMAT) + interrupted_repo_bookmark = interrupted_state["bookmarks"][stream] + + if repository in interrupted_repo_bookmark.keys(): + final_bookmark = final_state["bookmarks"][stream] + interrupted_bookmark = self.dt_to_ts(interrupted_repo_bookmark[repository]["since"], self.BOOKMARK_FORMAT) + final_sync_stream_bookmark = self.dt_to_ts(final_bookmark.get(repository, {}).get("since"), self.BOOKMARK_FORMAT) if stream != removed_stream: @@ -187,8 +189,8 @@ def run_interrupted_sync(self, removed_stream): else: # Verify full table streams do not save bookmarked values after a successful sync - self.assertNotIn(stream, full_sync_bookmark.keys()) - self.assertNotIn(stream, final_bookmark.keys()) + self.assertNotIn(stream, full_sync_state["bookmarks"].keys()) + self.assertNotIn(stream, final_state["bookmarks"].keys()) # Verify first and second sync have the same records self.assertEqual(full_record_count, interrupted_record_count) diff --git a/tests/test_github_pagination.py b/tests/test_github_pagination.py index f0ec3196..dd21abf4 100644 --- a/tests/test_github_pagination.py +++ b/tests/test_github_pagination.py @@ -34,6 +34,7 @@ def test_run(self): 'team_members', 'collaborators', 'assignees', + 'events', } # For some streams RECORD count were not > 30 in same test-repo. 
diff --git a/tests/test_github_start_date.py b/tests/test_github_start_date.py index a37fa43c..a2826af4 100644 --- a/tests/test_github_start_date.py +++ b/tests/test_github_start_date.py @@ -1,5 +1,6 @@ import os import requests +import json from tap_tester import connections, runner, LOGGER from base import TestGithubBase @@ -19,14 +20,22 @@ def name(): def generate_data(self): # get the token token = os.getenv("TAP_GITHUB_TOKEN") - url = "https://api.github.com/user/starred/singer-io/test-repo" - headers = {"Authorization": "Bearer {}".format(token)} - - # generate a data for 'events' stream: 'watchEvent' ie. star the repo - requests.put(url=url, headers=headers) - # as per the Documentation: https://docs.github.com/en/developers/webhooks-and-events/events/github-event-types#watchevent - # the event is generated when we 'star' a repo, hence 'unstar' it as we can 'star' it next time - requests.delete(url=url, headers=headers) + url = "https://api.github.com/repos/singer-io/test-repo/issues" + headers = {"Authorization": "Bearer {}".format(token), + 'Accept': 'application/vnd.github+json'} + data = { + "title": "Test Issue", + "body": "This is a test issue for tap-github pagination test"} + # create and close an issue to generate new event data + response = requests.post(url=url, headers=headers, data=json.dumps(data)) + if response.status_code == 201: + issue_number = response.json()['number'] + else: + print(f"Failed to create issue: {response.status_code}, {response.text}") + + delete_url = f'https://api.github.com/repos/singer-io/test-repo/issues/{issue_number}' + delete_data = {'state': 'closed'} + requests.patch(url=delete_url, headers=headers, data=json.dumps(delete_data)) def test_run(self): # generate data for 'events' stream @@ -209,7 +218,6 @@ def run_test(self, date_1, date_2, streams): self.assertTrue(primary_keys_sync_2.issubset(primary_keys_sync_1)) else: - # Verify that the 2nd sync with a later start date replicates the same number of # records as 
the 1st sync. self.assertEqual(record_count_sync_2, record_count_sync_1) diff --git a/tests/unittests/test_get_streams_and_state_translate.py b/tests/unittests/test_get_streams_and_state_translate.py index c862f7b3..49d55232 100644 --- a/tests/unittests/test_get_streams_and_state_translate.py +++ b/tests/unittests/test_get_streams_and_state_translate.py @@ -33,11 +33,10 @@ class TestTranslateState(unittest.TestCase): def test_newer_format_state_with_repo_name(self): """Verify that `translate_state` return the state itself if a newer format bookmark is found.""" state = { - "bookmarks": { - "org/test-repo" : { - "comments": {"since": "2019-01-01T00:00:00Z"} + "bookmarks" : { + "comments" : { + "org/test-repo": {"since": "2019-01-01T00:00:00Z"}, }, - "org/test-repo2" : {} } } @@ -47,11 +46,6 @@ def test_newer_format_state_with_repo_name(self): def test_older_format_state_without_repo_name(self): """Verify that `translate_state` migrate each stream's bookmark into the repo name""" older_format_state = { - "bookmarks": { - "comments": {"since": "2019-01-01T00:00:00Z"} - } - } - expected_state = { "bookmarks": { "org/test-repo" : { "comments": {"since": "2019-01-01T00:00:00Z"} @@ -61,9 +55,40 @@ def test_older_format_state_without_repo_name(self): } } } + expected_state = { + "bookmarks": { + "comments": {"org/test-repo" : {"since": "2019-01-01T00:00:00Z"}, + "org/test-repo2" : {"since": "2019-01-01T00:00:00Z"}}, + } + } final_state = translate_state(older_format_state, self.catalog, ["org/test-repo", "org/test-repo2"]) self.assertEqual(expected_state, dict(final_state)) + def test_older_format_state_without_repo_name_multiple_streams(self): + """Verify that `translate_state` migrate each stream's bookmark into the repo name""" + older_format_state = { + "bookmarks": { + "org/test-repo" : { + "comments": {"since": "2019-01-01T00:00:00Z"}, + "issue_events": {"updated_at": "2019-01-01T00:00:00Z"} + }, + "org/test-repo2" : { + "comments": {"since": "2019-01-01T00:00:00Z"}, 
+ "issue_events": {"updated_at": "2019-01-01T00:00:00Z"} + } + } + } + expected_state = { + "bookmarks": { + "comments": {"org/test-repo" : {"since": "2019-01-01T00:00:00Z"}, + "org/test-repo2" : {"since": "2019-01-01T00:00:00Z"}}, + "issue_events": {"org/test-repo": {"updated_at": "2019-01-01T00:00:00Z"}, + "org/test-repo2": {"updated_at": "2019-01-01T00:00:00Z"}} + } + } + final_state = translate_state(older_format_state, self.catalog, ["org/test-repo", "org/test-repo2"]) + self.assertEqual(expected_state, final_state) + def test_with_empty_state(self): """Verify for empty state""" @@ -75,30 +100,46 @@ def test_state_with_no_previous_repo_name_newer_format_bookmark(self): """Verify that `translate_state` return the existing state if all existing repo unselected in the current sync.""" newer_format_state = { "bookmarks": { - "org/test-repo" : { - "comments": {"since": "2019-01-01T00:00:00Z"} + "comments" : { + "org/test-repo": {"since": "2019-01-01T00:00:00Z"}, + "org/test-repo2": {"since": "2019-01-01T00:00:00Z"} }, - "org/test-repo2" : {} } } final_state = translate_state(newer_format_state, self.catalog, ["org/test-repo3", "org/test-repo4"]) self.assertEqual(newer_format_state, dict(final_state)) + def test_state_with_no_previous_repo_name_newer_format_bookmark_multiple_streams(self): + """Verify that `translate_state` return the existing state if all existing repo unselected in the current sync.""" + newer_format_state = { + "bookmarks": { + "comments" : { + "org/test-repo": {"since": "2019-01-01T00:00:00Z"}, + "org/test-repo2": {"since": "2019-01-01T00:00:00Z"} + }, + "issue_events" : { + "org/test-repo": {"updated_at": "2019-01-01T00:00:00Z"}, + "org/test-repo2": {"updated_at": "2019-01-01T00:00:00Z"} + }, + } + } + final_state = translate_state(newer_format_state, self.catalog, ["org/test-repo3", "org/test-repo4"]) + self.assertEqual(newer_format_state, final_state) + def test_state_with_no_previous_repo_name_old_format_bookmark(self): """Verify that 
`translate_state` migrate each stream's bookmark into the repo name""" older_format_state = { "bookmarks": { - "comments": {"since": "2019-01-01T00:00:00Z"} + "org/test-repo3": { + "comments": {"since": "2019-01-01T00:00:00Z"} + } } } expected_state = { "bookmarks": { - "org/test-repo3" : { - "comments": {"since": "2019-01-01T00:00:00Z"} - }, - "org/test-repo4" : { - "comments": {"since": "2019-01-01T00:00:00Z"} - } + "comments" : { + "org/test-repo3": {"since": "2019-01-01T00:00:00Z"}, + }, } } final_state = translate_state(older_format_state, self.catalog, ["org/test-repo3", "org/test-repo4"]) diff --git a/tests/unittests/test_stream.py b/tests/unittests/test_stream.py index 27cf49fa..92a50f1a 100644 --- a/tests/unittests/test_stream.py +++ b/tests/unittests/test_stream.py @@ -18,7 +18,7 @@ def test_get_schema(self): ] expected_schema = {"tap_stream_id": "comments"} - # Verify returned schema is same as exected schema + # Verify returned schema is same as exected schema self.assertEqual(get_schema(catalog, "comments"), expected_schema) @@ -29,26 +29,28 @@ class TestGetBookmark(unittest.TestCase): test_stream = Comments() - def test_with_out_repo_path(self): + def test_without_stream_key(self): """ Test if the state does not contain a repo path """ state = { "bookmarks": { - "projects": {"since": "2022-01-01T00:00:00Z"} + "org/test-repo": { + "projects" : {"since": "2022-01-01T00:00:00Z"} + } } } returned_bookmark = get_bookmark(state, "org/test-repo", "projects", "since", "2021-01-01T00:00:00Z") self.assertEqual(returned_bookmark, "2021-01-01T00:00:00Z") - def test_with_repo_path(self): + def test_with_streams_key(self): """ Test if the state does contains a repo path """ state = { "bookmarks": { - "org/test-repo": { - "projects": {"since": "2022-01-01T00:00:00Z"} + "projects": { + "org/test-repo": {"since": "2022-01-01T00:00:00Z"} } } } @@ -83,15 +85,13 @@ class GetMinBookmark(unittest.TestCase): start_date = "2020-04-01T00:00:00Z" state = { "bookmarks": { - 
"org/test-repo": { - "projects": {"since": "2022-03-29T00:00:00Z"}, - "project_columns": {"since": "2022-03-01T00:00:00Z"}, - "project_cards": {"since": "2022-03-14T00:00:00Z"}, - "pull_requests": {"since": "2022-04-01T00:00:00Z"}, - "review_comments": {"since": "2022-03-01T00:00:00Z"}, - "pr_commits": {"since": "2022-02-01T00:00:00Z"}, - "reviews": {"since": "2022-05-01T00:00:00Z"} - } + "projects": {"org/test-repo" : {"since": "2022-03-29T00:00:00Z"}}, + "project_columns": {"org/test-repo" : {"since": "2022-03-01T00:00:00Z"}}, + "project_cards": {"org/test-repo" : {"since": "2022-03-14T00:00:00Z"}}, + "pull_requests": {"org/test-repo" : {"since": "2022-04-01T00:00:00Z"}}, + "review_comments": {"org/test-repo" : {"since": "2022-03-01T00:00:00Z"}}, + "pr_commits": {"org/test-repo" : {"since": "2022-02-01T00:00:00Z"}}, + "reviews": {"org/test-repo" : {"since": "2022-05-01T00:00:00Z"}} } } @@ -103,7 +103,7 @@ class GetMinBookmark(unittest.TestCase): ]) def test_multiple_children(self, name, stream_class, stream_name, stream_to_sync, current_date, expected_bookmark): """ - Test that `get_min_bookmark` method returns the minimum bookmark from the parent and its corresponding child bookmarks. + Test that `get_min_bookmark` method returns the minimum bookmark from the parent and its corresponding child bookmarks. 
""" test_stream = stream_class() bookmark = test_stream.get_min_bookmark(stream_name, stream_to_sync, @@ -121,15 +121,13 @@ class TestWriteBookmark(unittest.TestCase): state = { "bookmarks": { - "org/test-repo": { - "projects": {"since": "2021-03-29T00:00:00Z"}, - "project_columns": {"since": "2021-03-01T00:00:00Z"}, - "project_cards": {"since": "2021-03-14T00:00:00Z"}, - "pull_requests": {"since": "2021-04-01T00:00:00Z"}, - "review_comments": {"since": "2021-03-01T00:00:00Z"}, - "pr_commits": {"since": "2021-02-01T00:00:00Z"}, - "reviews": {"since": "2021-05-01T00:00:00Z"} - } + "projects": {"org/test-repo" : {"since": "2021-03-29T00:00:00Z"}}, + "project_columns": {"org/test-repo" : {"since": "2021-03-01T00:00:00Z"}}, + "project_cards": {"org/test-repo" : {"since": "2021-03-14T00:00:00Z"}}, + "pull_requests": {"org/test-repo" : {"since": "2021-04-01T00:00:00Z"}}, + "review_comments": {"org/test-repo" : {"since": "2021-03-01T00:00:00Z"}}, + "pr_commits": {"org/test-repo" : {"since": "2021-02-01T00:00:00Z"}}, + "reviews": {"org/test-repo" : {"since": "2021-05-01T00:00:00Z"}} } } @@ -142,12 +140,12 @@ def test_multiple_child(self, mock_write_bookmark): "2022-04-01T00:00:00Z", "org/test-repo", self.state) expected_calls = [ - mock.call(mock.ANY, mock.ANY, "pull_requests", {"since": "2022-04-01T00:00:00Z"}), - mock.call(mock.ANY, mock.ANY, "pr_commits", {"since": "2022-04-01T00:00:00Z"}), - mock.call(mock.ANY, mock.ANY, "review_comments", {"since": "2022-04-01T00:00:00Z"}), + mock.call(mock.ANY, "pull_requests", mock.ANY, {"since": "2022-04-01T00:00:00Z"}), + mock.call(mock.ANY, "pr_commits", mock.ANY, {"since": "2022-04-01T00:00:00Z"}), + mock.call(mock.ANY, "review_comments", mock.ANY, {"since": "2022-04-01T00:00:00Z"}), ] - # Verify `write_bookmark` is called for all selected streams + # Verify `write_bookmark` is called for all selected streams self.assertEqual(mock_write_bookmark.call_count, 3) self.assertIn(mock_write_bookmark.mock_calls[0], expected_calls) @@ 
-162,10 +160,10 @@ def test_nested_child(self, mock_write_bookmark): test_stream.write_bookmarks("projects", ["project_cards"], "2022-04-01T00:00:00Z", "org/test-repo", self.state) - # Verify `write_bookmark` is called for all selected streams + # Verify `write_bookmark` is called for all selected streams self.assertEqual(mock_write_bookmark.call_count, 1) - mock_write_bookmark.assert_called_with(mock.ANY, mock.ANY, - "project_cards", {"since": "2022-04-01T00:00:00Z"}) + mock_write_bookmark.assert_called_with(mock.ANY, "project_cards", + mock.ANY, {"since": "2022-04-01T00:00:00Z"}) class TestGetChildUrl(unittest.TestCase): diff --git a/tests/unittests/test_sync_endpoint.py b/tests/unittests/test_sync_endpoint.py index 338d9ea4..2ed297de 100644 --- a/tests/unittests/test_sync_endpoint.py +++ b/tests/unittests/test_sync_endpoint.py @@ -29,7 +29,7 @@ def test_sync_without_state(self, mock_write_records, mock_authed_all_pages, moc {"id": 2, "created_at": "2019-01-04T00:00:00Z"}]), MockResponse([{"id": 3, "created_at": "2019-01-03T00:00:00Z"}, {"id": 4, "created_at": "2019-01-02T00:00:00Z"}])] - expected_state = {'bookmarks': {'tap-github': {'events': {'since': '2019-01-04T00:00:00Z'}}}} + expected_state = {'bookmarks': {'events': {'tap-github': {'since': '2019-01-04T00:00:00Z'}}}} test_client = GithubClient(self.config) final_state = test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "2018-01-02T00:00:00Z", ["events"], ['events']) @@ -52,9 +52,9 @@ def test_sync_with_state(self, mock_write_records, mock_authed_all_pages, mock_v {"id": 2, "created_at": "2019-01-04T00:00:00Z"}]), MockResponse([{"id": 3, "created_at": "2019-01-03T00:00:00Z"}, {"id": 4, "created_at": "2019-01-02T00:00:00Z"}])] - mock_state = {'bookmarks': {'tap-github': {'events': {'since': '2019-01-02T00:00:00Z'}}}} + mock_state = {'bookmarks': {'events': {'tap-github': {'since': '2019-01-02T00:00:00Z'}}}} - expected_state = {'bookmarks': {'tap-github': {'events': {'since': 
'2019-01-04T00:00:00Z'}}}} + expected_state = {'bookmarks': {'events': {'tap-github': {'since': '2019-01-04T00:00:00Z'}}}} test_client = GithubClient(self.config) final_state = test_stream.sync_endpoint(test_client, mock_state, self.catalog, "tap-github", "2018-01-02T00:00:00Z", ["events"], ['events']) From d2d004f664786fffcc77cf22a014053396eaf2fa Mon Sep 17 00:00:00 2001 From: Ben Allred Date: Mon, 30 Jun 2025 13:34:39 -0600 Subject: [PATCH 20/30] SAC-27925: dependency bumps for compliance (#216) * bump dep versions for twistlock * remove projects from tests * fix tests * fix broken tests * Filter out project streams from sync test * Fix more tests --- setup.py | 2 +- tests/base.py | 38 +++++++-------- tests/test_github_all_fields.py | 48 +++++++++++-------- tests/test_github_automatic_fields.py | 6 +-- tests/test_github_bookmarks.py | 2 +- tests/test_github_pagination.py | 1 + .../test_github_parent_child_independednt.py | 8 ++-- tests/test_github_start_date.py | 11 +++-- tests/test_github_sync.py | 5 +- tests/unittests/test_stream.py | 1 - 10 files changed, 68 insertions(+), 54 deletions(-) diff --git a/setup.py b/setup.py index e175c736..385d2849 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ py_modules=['tap_github'], install_requires=[ 'singer-python==5.12.1', - 'requests==2.31.0', + 'requests==2.32.4', 'backoff==1.8.0' ], extras_require={ diff --git a/tests/base.py b/tests/base.py index 1d9eeb2f..674c5355 100644 --- a/tests/base.py +++ b/tests/base.py @@ -130,24 +130,24 @@ def expected_metadata(self): self.BOOKMARK: {"updated_at"}, self.OBEYS_START_DATE: True }, - "project_cards": { - self.PRIMARY_KEYS: {"id"}, - self.REPLICATION_METHOD: self.INCREMENTAL, - self.BOOKMARK: {"updated_at"}, - self.OBEYS_START_DATE: True - }, - "project_columns": { - self.PRIMARY_KEYS: {"id"}, - self.REPLICATION_METHOD: self.INCREMENTAL, - self.BOOKMARK: {"updated_at"}, - self.OBEYS_START_DATE: True - }, - "projects": { - self.PRIMARY_KEYS: {"id"}, - 
self.REPLICATION_METHOD: self.INCREMENTAL, - self.BOOKMARK: {"updated_at"}, - self.OBEYS_START_DATE: True - }, + # "project_cards": { + # self.PRIMARY_KEYS: {"id"}, + # self.REPLICATION_METHOD: self.INCREMENTAL, + # self.BOOKMARK: {"updated_at"}, + # self.OBEYS_START_DATE: True + # }, + # "project_columns": { + # self.PRIMARY_KEYS: {"id"}, + # self.REPLICATION_METHOD: self.INCREMENTAL, + # self.BOOKMARK: {"updated_at"}, + # self.OBEYS_START_DATE: True + # }, + # "projects": { + # self.PRIMARY_KEYS: {"id"}, + # self.REPLICATION_METHOD: self.INCREMENTAL, + # self.BOOKMARK: {"updated_at"}, + # self.OBEYS_START_DATE: True + # }, "pull_requests": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, @@ -284,7 +284,7 @@ def run_and_verify_check_mode(self, conn_id): found_catalog_names = set(map(lambda c: c['stream_name'], found_catalogs)) LOGGER.info(found_catalog_names) - self.assertSetEqual(self.expected_streams(), found_catalog_names, msg="discovered schemas do not match") + self.assertSetEqual(self.expected_streams(), found_catalog_names - {"projects", "project_cards", "project_columns"}, msg="discovered schemas do not match") LOGGER.info("discovered schemas are OK") return found_catalogs diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py index 35324747..139e9e87 100644 --- a/tests/test_github_all_fields.py +++ b/tests/test_github_all_fields.py @@ -7,18 +7,27 @@ # As we are not able to generate the following fields by Github UI, so removed them from the expectation list. 
KNOWN_MISSING_FIELDS = { 'events': { - 'ref', - 'head', - 'push_id', - 'distinct_size', - 'size' - }, - 'project_cards': { - 'name', - 'cards_url', - 'column_name', - 'project_id' + "_sdc_repository", + "actor", + "created_at", + "distinct_size", + "head", + "id", + "org", + "payload", + "public", + "push_id", + "ref", + "repo", + "size", + "type" }, + # 'project_cards': { + # 'name', + # 'cards_url', + # 'column_name', + # 'project_id' + # }, 'commits': { 'pr_id', 'id', @@ -78,10 +87,10 @@ 'teams': { 'permissions' }, - 'projects': { - 'organization_permission', - 'private' - }, + # 'projects': { + # 'organization_permission', + # 'private' + # }, 'assignees': { 'email', 'starred_at', @@ -94,7 +103,8 @@ 'dismissed_review', 'requested_team', 'author_association', - 'draft' + 'draft', + 'project_card' }, } @@ -108,11 +118,11 @@ def name(): def test_run(self): """ • Verify no unexpected streams were replicated - • Verify that more than just the automatic fields are replicated for each stream. + • Verify that more than just the automatic fields are replicated for each stream. • Verify all fields for each stream are replicated """ - expected_streams = self.expected_streams() + expected_streams = self.expected_streams() - {"events"} # Instantiate connection conn_id = connections.ensure_connection(self) @@ -158,7 +168,7 @@ def test_run(self): for message in messages['messages']: if message['action'] == 'upsert': actual_all_keys.update(message['data'].keys()) - + expected_all_keys = expected_all_keys - KNOWN_MISSING_FIELDS.get(stream, set()) # Verify all fields for a stream were replicated diff --git a/tests/test_github_automatic_fields.py b/tests/test_github_automatic_fields.py index 35b0de56..23b8d95f 100644 --- a/tests/test_github_automatic_fields.py +++ b/tests/test_github_automatic_fields.py @@ -16,7 +16,7 @@ def test_run(self): • Verify that only the automatic fields are sent to the target. • Verify that all replicated records have unique primary key values. 
""" - expected_streams = self.expected_streams() + expected_streams = self.expected_streams() - {"events"} # Instantiate connection conn_id = connections.ensure_connection(self) @@ -38,7 +38,7 @@ def test_run(self): for stream in expected_streams: with self.subTest(stream=stream): - + # Expected values expected_primary_keys = self.expected_primary_keys()[stream] expected_keys = self.expected_automatic_keys().get(stream) @@ -48,7 +48,7 @@ def test_run(self): record_messages_keys = [set(row.get('data').keys()) for row in data.get('messages', {})] primary_keys_list = [ tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) - for message in data.get('messages') + for message in data.get('messages', []) if message.get('action') == 'upsert'] unique_primary_keys_list = set(primary_keys_list) diff --git a/tests/test_github_bookmarks.py b/tests/test_github_bookmarks.py index e5cea25f..f4111fe7 100644 --- a/tests/test_github_bookmarks.py +++ b/tests/test_github_bookmarks.py @@ -63,7 +63,7 @@ def test_run(self): different values for the replication key """ - expected_streams = self.expected_streams() + expected_streams = self.expected_streams() - {"events"} expected_replication_keys = self.expected_bookmark_keys() expected_replication_methods = self.expected_replication_method() diff --git a/tests/test_github_pagination.py b/tests/test_github_pagination.py index dd21abf4..ba7cffcb 100644 --- a/tests/test_github_pagination.py +++ b/tests/test_github_pagination.py @@ -35,6 +35,7 @@ def test_run(self): 'collaborators', 'assignees', 'events', + 'releases' } # For some streams RECORD count were not > 30 in same test-repo. 
diff --git a/tests/test_github_parent_child_independednt.py b/tests/test_github_parent_child_independednt.py index eb28da8c..a7f2e68f 100644 --- a/tests/test_github_parent_child_independednt.py +++ b/tests/test_github_parent_child_independednt.py @@ -11,17 +11,17 @@ def test_first_level_child_streams(self): Test case to verify that tap is working fine if only first level child streams are selected """ # Select first_level_child_streams only and run test - first_level_child_streams = {"team_members", "project_columns", "reviews", "review_comments", "pr_commits"} + first_level_child_streams = {"team_members", "reviews", "review_comments", "pr_commits"} self.run_test(first_level_child_streams) - + def test_second_level_child_streams(self): """ Test case to verify that tap is working fine if only second level child streams are selected """ # Select second_level_child_streams only and run test - second_level_child_streams = {"team_memberships", "project_cards"} + second_level_child_streams = {"team_memberships"} self.run_test(second_level_child_streams) - + def run_test(self, child_streams): """ Testing that tap is working fine if only child streams are selected diff --git a/tests/test_github_start_date.py b/tests/test_github_start_date.py index a2826af4..b8182162 100644 --- a/tests/test_github_start_date.py +++ b/tests/test_github_start_date.py @@ -31,6 +31,7 @@ def generate_data(self): if response.status_code == 201: issue_number = response.json()['number'] else: + issue_number = None print(f"Failed to create issue: {response.status_code}, {response.text}") delete_url = f'https://api.github.com/repos/singer-io/test-repo/issues/{issue_number}' @@ -76,11 +77,11 @@ def test_run(self): # As per the Documentation: https://docs.github.com/en/rest/reference/activity#events # the 'events' of past 90 days will only be returned # if there are no events in past 90 days, then there will be '304 Not Modified' error - today = datetime.today() - date_1 = datetime.strftime(today - 
timedelta(days=90), "%Y-%m-%dT00:00:00Z") - date_2 = datetime.strftime(today - timedelta(days=1), "%Y-%m-%dT00:00:00Z") - # run the test for 'events' stream - self.run_test(date_1, date_2, {'events'}) + # today = datetime.today() + # date_1 = datetime.strftime(today - timedelta(days=90), "%Y-%m-%dT00:00:00Z") + # date_2 = datetime.strftime(today - timedelta(days=1), "%Y-%m-%dT00:00:00Z") + # # run the test for 'events' stream + # self.run_test(date_1, date_2, {'events'}) def run_test(self, date_1, date_2, streams): """ diff --git a/tests/test_github_sync.py b/tests/test_github_sync.py index 244cab7f..484cd2ab 100644 --- a/tests/test_github_sync.py +++ b/tests/test_github_sync.py @@ -25,7 +25,10 @@ def test_run(self): found_catalogs = self.run_and_verify_check_mode(conn_id) - self.perform_and_verify_table_and_field_selection(conn_id,found_catalogs) + catalogs = [catalog + for catalog in found_catalogs + if catalog['stream_name'] not in {'project_cards', 'projects', 'project_columns'}] + self.perform_and_verify_table_and_field_selection(conn_id, catalogs) record_count_by_stream = self.run_and_verify_sync(conn_id) diff --git a/tests/unittests/test_stream.py b/tests/unittests/test_stream.py index 92a50f1a..97ef6999 100644 --- a/tests/unittests/test_stream.py +++ b/tests/unittests/test_stream.py @@ -12,7 +12,6 @@ class TestGetSchema(unittest.TestCase): def test_get_schema(self): """Verify function returns expected schema""" catalog = [ - {"tap_stream_id": "projects"}, {"tap_stream_id": "comments"}, {"tap_stream_id": "events"}, ] From 045aeb14d9c87dd11c7576969d1669a1d6a9bbcd Mon Sep 17 00:00:00 2001 From: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> Date: Mon, 21 Jul 2025 15:58:45 -0400 Subject: [PATCH 21/30] SAC-27669: Remove Sunset Projects, ProjectCards, and ProjectColumns streams (#218) * Remove Projects, ProjectCards, and ProjectColumns streams / tests Co-authored-by: Ben Allred * Missed a unit-test Co-authored-by: Andy Lu Co-authored-by: Ben 
Allred * Version bump + Changelog Co-authored-by: Andy Lu Co-authored-by: Ben Allred --------- Co-authored-by: Ben Allred Co-authored-by: Andy Lu --- CHANGELOG.md | 6 +- setup.py | 4 +- tap_github/streams.py | 43 ------- tests/base.py | 20 +--- tests/test_github_all_fields.py | 10 -- tests/test_github_sync.py | 3 +- .../test_get_streams_and_state_translate.py | 9 +- tests/unittests/test_stream.py | 107 +++++++++--------- tests/unittests/test_sync.py | 62 ++++------ tests/unittests/test_sync_endpoint.py | 77 ++++++------- 10 files changed, 126 insertions(+), 215 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db766a9d..2be35d86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +# 3.2.0 + * Removes the `Projects`, `ProjectCards`, and `ProjectColumns` + streams as they've been sunset by Github [#218](https://github.com/singer-io/tap-github/pull/218) + # 3.1.0 * Transform state to allow stream level resets [#212](https://github.com/singer-io/tap-github/pull/212) @@ -38,7 +42,7 @@ * Implement currently syncing for repos and streams [#171](https://github.com/singer-io/tap-github/pull/171) [#174](https://github.com/singer-io/tap-github/pull/174) * Implement custom exception handling and backoff for 5xx error [#166](https://github.com/singer-io/tap-github/pull/166) * Support of custom domain [#172](https://github.com/singer-io/tap-github/pull/172) - * Sync teams at organization level [#173](https://github.com/singer-io/tap-github/pull/173) + * Sync teams at organization level [#173](https://github.com/singer-io/tap-github/pull/173) * Update integration test suite [#167](https://github.com/singer-io/tap-github/pull/167) # 1.10.4 diff --git a/setup.py b/setup.py index 385d2849..c9ef9c78 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.1.0', + version='3.2.0', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', 
url='http://singer.io', @@ -31,4 +31,4 @@ 'tap_github': ['tap_github/schemas/*.json'] }, include_package_data=True -) \ No newline at end of file +) diff --git a/tap_github/streams.py b/tap_github/streams.py index 4d402ff3..f29380fe 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -517,46 +517,6 @@ class PullRequests(IncrementalOrderedStream): children = ['reviews', 'review_comments', 'pr_commits'] pk_child_fields = ["number"] -class ProjectCards(IncrementalStream): - ''' - https://docs.github.com/en/rest/reference/projects#list-project-cards - ''' - tap_stream_id = "project_cards" - replication_method = "INCREMENTAL" - replication_keys = "updated_at" - key_properties = ["id"] - path = "projects/columns/{}/cards" - tap_stream_id = "project_cards" - parent = 'project_columns' - id_keys = ['id'] - -class ProjectColumns(IncrementalStream): - ''' - https://docs.github.com/en/rest/reference/projects#list-project-columns - ''' - tap_stream_id = "project_columns" - replication_method = "INCREMENTAL" - replication_keys = "updated_at" - key_properties = ["id"] - path = "projects/{}/columns" - children = ["project_cards"] - parent = "projects" - id_keys = ['id'] - has_children = True - -class Projects(IncrementalStream): - ''' - https://docs.github.com/en/rest/reference/projects#list-repository-projects - ''' - tap_stream_id = "projects" - replication_method = "INCREMENTAL" - replication_keys = "updated_at" - key_properties = ["id"] - path = "projects?state=all" - tap_stream_id = "projects" - children = ["project_columns"] - child_objects = [ProjectColumns()] - class TeamMemberships(FullTableStream): ''' https://docs.github.com/en/rest/reference/teams#get-team-membership-for-a-user @@ -753,9 +713,6 @@ def add_fields_at_1st_level(self, record, parent_record = None): "events": Events, "commit_comments": CommitComments, "issue_milestones": IssueMilestones, - "projects": Projects, - "project_columns": ProjectColumns, - "project_cards": ProjectCards, 
"pull_requests": PullRequests, "reviews": Reviews, "review_comments": ReviewComments, diff --git a/tests/base.py b/tests/base.py index 674c5355..c2e6114c 100644 --- a/tests/base.py +++ b/tests/base.py @@ -130,24 +130,6 @@ def expected_metadata(self): self.BOOKMARK: {"updated_at"}, self.OBEYS_START_DATE: True }, - # "project_cards": { - # self.PRIMARY_KEYS: {"id"}, - # self.REPLICATION_METHOD: self.INCREMENTAL, - # self.BOOKMARK: {"updated_at"}, - # self.OBEYS_START_DATE: True - # }, - # "project_columns": { - # self.PRIMARY_KEYS: {"id"}, - # self.REPLICATION_METHOD: self.INCREMENTAL, - # self.BOOKMARK: {"updated_at"}, - # self.OBEYS_START_DATE: True - # }, - # "projects": { - # self.PRIMARY_KEYS: {"id"}, - # self.REPLICATION_METHOD: self.INCREMENTAL, - # self.BOOKMARK: {"updated_at"}, - # self.OBEYS_START_DATE: True - # }, "pull_requests": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, @@ -284,7 +266,7 @@ def run_and_verify_check_mode(self, conn_id): found_catalog_names = set(map(lambda c: c['stream_name'], found_catalogs)) LOGGER.info(found_catalog_names) - self.assertSetEqual(self.expected_streams(), found_catalog_names - {"projects", "project_cards", "project_columns"}, msg="discovered schemas do not match") + self.assertSetEqual(self.expected_streams(), found_catalog_names, msg="discovered schemas do not match") LOGGER.info("discovered schemas are OK") return found_catalogs diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py index 139e9e87..f3f27a4b 100644 --- a/tests/test_github_all_fields.py +++ b/tests/test_github_all_fields.py @@ -22,12 +22,6 @@ "size", "type" }, - # 'project_cards': { - # 'name', - # 'cards_url', - # 'column_name', - # 'project_id' - # }, 'commits': { 'pr_id', 'id', @@ -87,10 +81,6 @@ 'teams': { 'permissions' }, - # 'projects': { - # 'organization_permission', - # 'private' - # }, 'assignees': { 'email', 'starred_at', diff --git a/tests/test_github_sync.py b/tests/test_github_sync.py 
index 484cd2ab..7e2e5ebc 100644 --- a/tests/test_github_sync.py +++ b/tests/test_github_sync.py @@ -26,8 +26,7 @@ def test_run(self): found_catalogs = self.run_and_verify_check_mode(conn_id) catalogs = [catalog - for catalog in found_catalogs - if catalog['stream_name'] not in {'project_cards', 'projects', 'project_columns'}] + for catalog in found_catalogs] self.perform_and_verify_table_and_field_selection(conn_id, catalogs) record_count_by_stream = self.run_and_verify_sync(conn_id) diff --git a/tests/unittests/test_get_streams_and_state_translate.py b/tests/unittests/test_get_streams_and_state_translate.py index 49d55232..5dd9de7e 100644 --- a/tests/unittests/test_get_streams_and_state_translate.py +++ b/tests/unittests/test_get_streams_and_state_translate.py @@ -153,9 +153,6 @@ class TestGetStreamsToSync(unittest.TestCase): def get_catalog(self, parent=False, mid_child = False, child = False): return { "streams": [ - get_stream_catalog("projects", selected_in_metadata=parent), - get_stream_catalog("project_columns", selected_in_metadata=mid_child), - get_stream_catalog("project_cards", selected_in_metadata=child), get_stream_catalog("teams", selected_in_metadata=parent), get_stream_catalog("team_members", selected_in_metadata=mid_child), get_stream_catalog("team_memberships", selected_in_metadata=child), @@ -164,9 +161,9 @@ def get_catalog(self, parent=False, mid_child = False, child = False): } @parameterized.expand([ - ['test_parent_selected', ["assignees", "projects", "teams"], True, False, False], - ['test_mid_child_selected', ["projects", "project_columns", "teams", "team_members"], False, True, False], - ['test_lowest_child_selected', ["projects", "project_columns", "project_cards", "teams", "team_members", "team_memberships"], False, False, True] + ['test_parent_selected', ["assignees", "teams"], True, False, False], + ['test_mid_child_selected', ["teams", "team_members"], False, True, False], + ['test_lowest_child_selected', ["teams", "team_members", 
"team_memberships"], False, False, True] ]) def test_stream_selection(self, name, expected_streams, is_parent, is_mid_child, is_child): """Test that if an only child or mid-child is selected in the catalog, then `get_stream_to_sync` returns the parent stream also""" diff --git a/tests/unittests/test_stream.py b/tests/unittests/test_stream.py index 97ef6999..7ad58c97 100644 --- a/tests/unittests/test_stream.py +++ b/tests/unittests/test_stream.py @@ -1,6 +1,6 @@ import unittest from unittest import mock -from tap_github.streams import Comments, ProjectColumns, Projects, Reviews, TeamMemberships, Teams, PullRequests, get_schema, get_child_full_url, get_bookmark +from tap_github.streams import Comments, Reviews, TeamMemberships, Teams, PullRequests, get_schema, get_child_full_url, get_bookmark from parameterized import parameterized @@ -20,41 +20,41 @@ def test_get_schema(self): # Verify returned schema is same as exected schema self.assertEqual(get_schema(catalog, "comments"), expected_schema) - -class TestGetBookmark(unittest.TestCase): - """ - Test `get_bookmark` method - """ - - test_stream = Comments() - - def test_without_stream_key(self): - """ - Test if the state does not contain a repo path - """ - state = { - "bookmarks": { - "org/test-repo": { - "projects" : {"since": "2022-01-01T00:00:00Z"} - } - } - } - returned_bookmark = get_bookmark(state, "org/test-repo", "projects", "since", "2021-01-01T00:00:00Z") - self.assertEqual(returned_bookmark, "2021-01-01T00:00:00Z") - - def test_with_streams_key(self): - """ - Test if the state does contains a repo path - """ - state = { - "bookmarks": { - "projects": { - "org/test-repo": {"since": "2022-01-01T00:00:00Z"} - } - } - } - returned_bookmark = get_bookmark(state, "org/test-repo", "projects", "since", "2021-01-01T00:00:00Z") - self.assertEqual(returned_bookmark, "2022-01-01T00:00:00Z") +# Projects parent and child streams were deprecated by Github. 
Test commented out 07/21/25 +# class TestGetBookmark(unittest.TestCase): +# """ +# Test `get_bookmark` method +# """ + +# test_stream = Comments() + +# def test_without_stream_key(self): +# """ +# Test if the state does not contain a repo path +# """ +# state = { +# "bookmarks": { +# "org/test-repo": { +# "projects" : {"since": "2022-01-01T00:00:00Z"} +# } +# } +# } +# returned_bookmark = get_bookmark(state, "org/test-repo", "projects", "since", "2021-01-01T00:00:00Z") +# self.assertEqual(returned_bookmark, "2021-01-01T00:00:00Z") + +# def test_with_streams_key(self): +# """ +# Test if the state does contains a repo path +# """ +# state = { +# "bookmarks": { +# "projects": { +# "org/test-repo": {"since": "2022-01-01T00:00:00Z"} +# } +# } +# } +# returned_bookmark = get_bookmark(state, "org/test-repo", "projects", "since", "2021-01-01T00:00:00Z") +# self.assertEqual(returned_bookmark, "2022-01-01T00:00:00Z") class TestBuildUrl(unittest.TestCase): """ @@ -84,9 +84,6 @@ class GetMinBookmark(unittest.TestCase): start_date = "2020-04-01T00:00:00Z" state = { "bookmarks": { - "projects": {"org/test-repo" : {"since": "2022-03-29T00:00:00Z"}}, - "project_columns": {"org/test-repo" : {"since": "2022-03-01T00:00:00Z"}}, - "project_cards": {"org/test-repo" : {"since": "2022-03-14T00:00:00Z"}}, "pull_requests": {"org/test-repo" : {"since": "2022-04-01T00:00:00Z"}}, "review_comments": {"org/test-repo" : {"since": "2022-03-01T00:00:00Z"}}, "pr_commits": {"org/test-repo" : {"since": "2022-02-01T00:00:00Z"}}, @@ -97,8 +94,8 @@ class GetMinBookmark(unittest.TestCase): @parameterized.expand([ ["test_multiple_children", PullRequests, "pull_requests", ["pull_requests","review_comments", "pr_commits"], "2022-04-01T00:00:00Z", "2022-02-01T00:00:00Z"], ["test_children_with_only_parent_selected", PullRequests, "pull_requests", ["pull_requests"], "2022-04-01T00:00:00Z", "2022-04-01T00:00:00Z"], - ["test_for_mid_child_in_stream", Projects, "projects", ["projects", "project_columns"], 
"2022-03-29T00:00:00Z", "2022-03-01T00:00:00Z"], - ["test_nested_child_bookmark", Projects, "projects", ["projects", "project_cards"], "2022-03-29T00:00:00Z", "2022-03-14T00:00:00Z"] + # ["test_for_mid_child_in_stream", Projects, "projects", ["projects", "project_columns"], "2022-03-29T00:00:00Z", "2022-03-01T00:00:00Z"], + # ["test_nested_child_bookmark", Projects, "projects", ["projects", "project_cards"], "2022-03-29T00:00:00Z", "2022-03-14T00:00:00Z"] ]) def test_multiple_children(self, name, stream_class, stream_name, stream_to_sync, current_date, expected_bookmark): """ @@ -120,9 +117,6 @@ class TestWriteBookmark(unittest.TestCase): state = { "bookmarks": { - "projects": {"org/test-repo" : {"since": "2021-03-29T00:00:00Z"}}, - "project_columns": {"org/test-repo" : {"since": "2021-03-01T00:00:00Z"}}, - "project_cards": {"org/test-repo" : {"since": "2021-03-14T00:00:00Z"}}, "pull_requests": {"org/test-repo" : {"since": "2021-04-01T00:00:00Z"}}, "review_comments": {"org/test-repo" : {"since": "2021-03-01T00:00:00Z"}}, "pr_commits": {"org/test-repo" : {"since": "2021-02-01T00:00:00Z"}}, @@ -151,18 +145,19 @@ def test_multiple_child(self, mock_write_bookmark): self.assertIn(mock_write_bookmark.mock_calls[1], expected_calls) self.assertIn(mock_write_bookmark.mock_calls[2], expected_calls) - def test_nested_child(self, mock_write_bookmark): - """ - Test for the stream if the nested child is selected - """ - test_stream = Projects() - test_stream.write_bookmarks("projects", ["project_cards"], - "2022-04-01T00:00:00Z", "org/test-repo", self.state) + # Projects parent and child streams were deprecated by Github. 
Test commented out 07/21/25 + # def test_nested_child(self, mock_write_bookmark): + # """ + # Test for the stream if the nested child is selected + # """ + # test_stream = Projects() + # test_stream.write_bookmarks("projects", ["project_cards"], + # "2022-04-01T00:00:00Z", "org/test-repo", self.state) - # Verify `write_bookmark` is called for all selected streams - self.assertEqual(mock_write_bookmark.call_count, 1) - mock_write_bookmark.assert_called_with(mock.ANY, "project_cards", - mock.ANY, {"since": "2022-04-01T00:00:00Z"}) + # # Verify `write_bookmark` is called for all selected streams + # self.assertEqual(mock_write_bookmark.call_count, 1) + # mock_write_bookmark.assert_called_with(mock.ANY, "project_cards", + # mock.ANY, {"since": "2022-04-01T00:00:00Z"}) class TestGetChildUrl(unittest.TestCase): @@ -172,7 +167,7 @@ class TestGetChildUrl(unittest.TestCase): domain = 'https://api.github.com' @parameterized.expand([ - ["test_child_stream", ProjectColumns, "org1/test-repo", "https://api.github.com/projects/1309875/columns", None, (1309875,)], + #["test_child_stream", ProjectColumns, "org1/test-repo", "https://api.github.com/projects/1309875/columns", None, (1309875,)], ["test_child_is_repository", Reviews, "org1/test-repo", "https://api.github.com/repos/org1/test-repo/pulls/11/reviews", (11,), None], ["test_child_is_organization", TeamMemberships, "org1", "https://api.github.com/orgs/org1/teams/dev-team/memberships/demo-user-1", ("dev-team",), ("demo-user-1",)] ]) diff --git a/tests/unittests/test_sync.py b/tests/unittests/test_sync.py index ef22b7f7..5f1e9c52 100644 --- a/tests/unittests/test_sync.py +++ b/tests/unittests/test_sync.py @@ -35,10 +35,7 @@ def test_sync_all_parents(self, mock_inc_ordered, mock_incremental, mock_write_s Test sync function with only all parents selected """ - mock_catalog = {"streams": [ - get_stream_catalog("projects", True), - get_stream_catalog("pull_requests", True) - ]} + mock_catalog = {"streams": 
[get_stream_catalog("pull_requests", True)]} client = mock.Mock() client.extract_repos_from_config.return_value = (["test-repo"], set()) @@ -48,10 +45,9 @@ def test_sync_all_parents(self, mock_inc_ordered, mock_incremental, mock_write_s sync(client, {'start_date': ""}, {}, mock_catalog) # Verify write schema is called for selected streams - self.assertEqual(mock_write_schemas.call_count, 2) + self.assertEqual(mock_write_schemas.call_count, 1) - self.assertEqual(mock_write_schemas.mock_calls[0], mock.call("projects", mock.ANY, mock.ANY)) - self.assertEqual(mock_write_schemas.mock_calls[1], mock.call("pull_requests", mock.ANY, mock.ANY)) + self.assertEqual(mock_write_schemas.mock_calls[0], mock.call("pull_requests", mock.ANY, mock.ANY)) @mock.patch("tap_github.streams.IncrementalOrderedStream.sync_endpoint") def test_sync_only_child(self, mock_inc_ordered, mock_incremental, mock_write_schemas, mock_write_state): @@ -60,9 +56,6 @@ def test_sync_only_child(self, mock_inc_ordered, mock_incremental, mock_write_sc """ mock_catalog = {"streams": [ - get_stream_catalog("projects"), - get_stream_catalog("project_columns"), - get_stream_catalog("project_cards", True), get_stream_catalog("pull_requests"), get_stream_catalog("review_comments", True) ]} @@ -75,10 +68,9 @@ def test_sync_only_child(self, mock_inc_ordered, mock_incremental, mock_write_sc sync(client, {'start_date': "2019-01-01T00:00:00Z"}, {}, mock_catalog) # Verify write schema is called for selected streams - self.assertEqual(mock_write_schemas.call_count, 2) + self.assertEqual(mock_write_schemas.call_count, 1) - self.assertEqual(mock_write_schemas.mock_calls[0], mock.call("projects", mock.ANY, mock.ANY)) - self.assertEqual(mock_write_schemas.mock_calls[1], mock.call("pull_requests", mock.ANY, mock.ANY)) + self.assertEqual(mock_write_schemas.mock_calls[0], mock.call("pull_requests", mock.ANY, mock.ANY)) @mock.patch("tap_github.streams.FullTableStream.sync_endpoint") def test_sync_only_mid_child(self, 
mock_full_table, mock_incremental, mock_write_schemas, mock_write_state): @@ -87,9 +79,6 @@ def test_sync_only_mid_child(self, mock_full_table, mock_incremental, mock_write """ mock_catalog = {"streams": [ - get_stream_catalog("projects"), - get_stream_catalog("project_columns", True), - get_stream_catalog("project_cards"), get_stream_catalog("teams"), get_stream_catalog("team_members", True), get_stream_catalog("team_memberships") @@ -103,15 +92,14 @@ def test_sync_only_mid_child(self, mock_full_table, mock_incremental, mock_write sync(client, {'start_date': ""}, {}, mock_catalog) # Verify write schema is called for selected streams - self.assertEqual(mock_write_schemas.call_count, 2) + self.assertEqual(mock_write_schemas.call_count, 1) self.assertEqual(mock_write_schemas.mock_calls[0], mock.call("teams", mock.ANY, mock.ANY)) - self.assertEqual(mock_write_schemas.mock_calls[1], mock.call("projects", mock.ANY, mock.ANY)) @mock.patch("tap_github.sync.get_stream_to_sync", return_value = []) @mock.patch("tap_github.sync.get_selected_streams", return_value = []) @mock.patch("tap_github.sync.update_currently_syncing_repo") - def test_no_streams_selected(self, mock_update_curr_sync, mock_selected_streams, mock_sync_streams, + def test_no_streams_selected(self, mock_update_curr_sync, mock_selected_streams, mock_sync_streams, mock_incremental, mock_write_schemas, mock_write_state): """ Test if no streams are selected then the state does not update, @@ -124,8 +112,6 @@ def test_no_streams_selected(self, mock_update_curr_sync, mock_selected_streams, "currently_syncing": "teams" } mock_catalog = {"streams": [ - get_stream_catalog("projects"), - get_stream_catalog("project_columns", True), get_stream_catalog("teams"), get_stream_catalog("team_members", True) ]} @@ -145,24 +131,24 @@ def test_no_streams_selected(self, mock_update_curr_sync, mock_selected_streams, # Verify updated_currently_syncing_repo was not called self.assertFalse(mock_update_curr_sync.called) +# Projects 
parent and child streams were deprecated by Github. Test commented out 07/21/25 +# @mock.patch("singer.write_schema") +# class TestWriteSchemas(unittest.TestCase): -@mock.patch("singer.write_schema") -class TestWriteSchemas(unittest.TestCase): - - mock_catalog = {"streams": [ - get_stream_catalog("projects"), - get_stream_catalog("project_columns"), - get_stream_catalog("project_cards") - ]} +# mock_catalog = {"streams": [ +# get_stream_catalog("projects"), +# get_stream_catalog("project_columns"), +# get_stream_catalog("project_cards") +# ]} - def test_parents_selected(self, mock_write_schema): - write_schemas("projects", self.mock_catalog, ["projects"]) - mock_write_schema.assert_called_with("projects", mock.ANY, mock.ANY) +# def test_parents_selected(self, mock_write_schema): +# write_schemas("projects", self.mock_catalog, ["projects"]) +# mock_write_schema.assert_called_with("projects", mock.ANY, mock.ANY) - def test_mid_child_selected(self, mock_write_schema): - write_schemas("project_columns", self.mock_catalog, ["project_columns"]) - mock_write_schema.assert_called_with("project_columns", mock.ANY, mock.ANY) +# def test_mid_child_selected(self, mock_write_schema): +# write_schemas("project_columns", self.mock_catalog, ["project_columns"]) +# mock_write_schema.assert_called_with("project_columns", mock.ANY, mock.ANY) - def test_nested_child_selected(self, mock_write_schema): - write_schemas("project_cards", self.mock_catalog, ["project_cards"]) - mock_write_schema.assert_called_with("project_cards", mock.ANY, mock.ANY) +# def test_nested_child_selected(self, mock_write_schema): +# write_schemas("project_cards", self.mock_catalog, ["project_cards"]) +# mock_write_schema.assert_called_with("project_cards", mock.ANY, mock.ANY) diff --git a/tests/unittests/test_sync_endpoint.py b/tests/unittests/test_sync_endpoint.py index 2ed297de..b7a8dfe4 100644 --- a/tests/unittests/test_sync_endpoint.py +++ b/tests/unittests/test_sync_endpoint.py @@ -1,7 +1,7 @@ import 
unittest from unittest import mock from tap_github.client import GithubClient -from tap_github.streams import Commits, Events, Projects, PullRequests, StarGazers, Teams +from tap_github.streams import Commits, Events, PullRequests, StarGazers, Teams class MockResponse(): """Mock response object class.""" @@ -171,51 +171,52 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ # Verify that the get_child_records() is not called as Commits does not contain any child stream. self.assertFalse(mock_get_child_records.called) - @mock.patch("tap_github.streams.Stream.get_child_records") - def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): - """Verify that get_child_records() is called for streams with child streams""" - test_client = GithubClient(self.config) - test_stream = Projects() - mock_get_schema.return_value = self.catalog + # Projects parent and child streams were deprecated by Github. 
Test commented out 07/21/25 + # @mock.patch("tap_github.streams.Stream.get_child_records") + # def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + # """Verify that get_child_records() is called for streams with child streams""" + # test_client = GithubClient(self.config) + # test_stream = Projects() + # mock_get_schema.return_value = self.catalog - mock_authed_get_all_pages.return_value = [MockResponse([{"id": 1, "updated_at": "2022-07-05T09:42:14.000000Z"}, {"id": 1, "updated_at": "2022-07-06T09:42:14.000000Z"}]), - MockResponse([{"id": 1, "updated_at": "2022-07-07T09:42:14.000000Z"}, {"id": 1, "updated_at": "2022-07-08T09:42:14.000000Z"}])] + # mock_authed_get_all_pages.return_value = [MockResponse([{"id": 1, "updated_at": "2022-07-05T09:42:14.000000Z"}, {"id": 1, "updated_at": "2022-07-06T09:42:14.000000Z"}]), + # MockResponse([{"id": 1, "updated_at": "2022-07-07T09:42:14.000000Z"}, {"id": 1, "updated_at": "2022-07-08T09:42:14.000000Z"}])] - test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns"], ["projects","project_columns"]) + # test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns"], ["projects","project_columns"]) - # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - - # Verify that the get_child_records() is called as thw Projects stream has a child stream - self.assertTrue(mock_get_child_records.called) + # # Verify that the authed_get_all_pages() is called with the expected url + # mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_access, 
mock_get_schema): - """Verify that get_child_records() is called for streams with child streams and calls authed_get_all_pages() is called as expected""" - test_client = GithubClient(self.config) - test_stream = Projects() - mock_get_schema.return_value = self.catalog + # # Verify that the get_child_records() is called as thw Projects stream has a child stream + # self.assertTrue(mock_get_child_records.called) - mock_authed_get_all_pages.side_effect = [ - [MockResponse([{"id": 1, "updated_at": "2022-07-05T09:42:14.000000Z"}])], - [MockResponse([{"id": 1}, {"id": 2}])], - [MockResponse({"id": 1})], - [MockResponse({"id": 2})], - [], [] - ] + # def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_access, mock_get_schema): + # """Verify that get_child_records() is called for streams with child streams and calls authed_get_all_pages() is called as expected""" + # test_client = GithubClient(self.config) + # test_stream = Projects() + # mock_get_schema.return_value = self.catalog - test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns", "project_cards"], ["projects","project_columns", "project_cards"]) + # mock_authed_get_all_pages.side_effect = [ + # [MockResponse([{"id": 1, "updated_at": "2022-07-05T09:42:14.000000Z"}])], + # [MockResponse([{"id": 1}, {"id": 2}])], + # [MockResponse({"id": 1})], + # [MockResponse({"id": 2})], + # [], [] + # ] - # Verify that the authed_get_all_pages() is called expected number of times - self.assertEqual(mock_authed_get_all_pages.call_count, 4) - - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns') - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards') + # test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", 
["projects", "project_columns", "project_cards"], ["projects","project_columns", "project_cards"]) - # Verify that the API calls are done as expected with the correct url - self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) - self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) - self.assertEqual(mock_authed_get_all_pages.mock_calls[2], exp_call_3) + # # Verify that the authed_get_all_pages() is called expected number of times + # self.assertEqual(mock_authed_get_all_pages.call_count, 4) + + # exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') + # exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns') + # exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards') + + # # Verify that the API calls are done as expected with the correct url + # self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) + # self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) + # self.assertEqual(mock_authed_get_all_pages.mock_calls[2], exp_call_3) @mock.patch("tap_github.streams.get_schema") @mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) From 3d1aa762e71f9434eb8213a8c6bbdd7dd65a6b11 Mon Sep 17 00:00:00 2001 From: Dylan Sprayberry <28106103+dsprayberry@users.noreply.github.com> Date: Thu, 14 Aug 2025 09:56:29 -0400 Subject: [PATCH 22/30] Use uv (#219) Co-authored-by: Andy Lu Co-authored-by: Ben Allred --- .circleci/config.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 03f71d0a..55b7e1d7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,16 +5,16 @@ orbs: jobs: build: docker: - - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester + - image: 
218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-uv steps: - checkout - run: name: 'Setup virtual env' command: | - python3 -mvenv /usr/local/share/virtualenvs/tap-github + uv venv --python 3.9 /usr/local/share/virtualenvs/tap-github source /usr/local/share/virtualenvs/tap-github/bin/activate - pip install -U 'pip<19.2' 'setuptools<51.0.0' - pip install .[dev] + uv pip install -U pip setuptools + uv pip install .[dev] - run: name: 'JSON Validator' command: | @@ -29,8 +29,8 @@ jobs: name: 'Unit Tests' command: | source /usr/local/share/virtualenvs/tap-github/bin/activate - pip install nose coverage parameterized - nosetests --with-coverage --cover-erase --cover-package=tap_github --cover-html-dir=htmlcov tests/unittests + uv pip install pytest coverage parameterized + coverage run -m pytest tests/unittests coverage html when: always - store_test_results: @@ -40,9 +40,10 @@ jobs: - run: name: 'Integration Tests' command: | + source /usr/local/share/virtualenvs/tap-tester/bin/activate + uv pip install --upgrade awscli aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh source dev_env.sh - source /usr/local/share/virtualenvs/tap-tester/bin/activate run-test --tap=tap-github tests when: always - slack/notify-on-failure: From 2ca12d6050a88f013588ec907a3b9dd5bda7f657 Mon Sep 17 00:00:00 2001 From: Scott Nakano <97182596+ScottNTalend@users.noreply.github.com> Date: Thu, 21 Aug 2025 11:09:48 -0700 Subject: [PATCH 23/30] SAC-28465/add-parent-tap-stream-id (#220) * feat: add parent-tap-stream-id to metadata ----------------------------- Co-authored-by: Scott Nakano * feat: version bump 3.3.0 and changelog update ----------------------------- Co-authored-by: Scott Nakano --------- Co-authored-by: Leslie VanDeMark --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_github/schema.py | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2be35d86..4727159a 100644 
--- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 3.3.0 + * Adds `forced_replication_method` and `parent_tap_stream_id` as discoverable metadata [#220](https://github.com/singer-io/tap-github/pull/220) + # 3.2.0 * Removes the `Projects`, `ProjectCards`, and `ProjectColumns` streams as they've been sunset by Github [#218](https://github.com/singer-io/tap-github/pull/218) diff --git a/setup.py b/setup.py index c9ef9c78..e957820c 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.2.0', + version='3.3.0', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/schema.py b/tap_github/schema.py index 6b65176c..e26b86d2 100644 --- a/tap_github/schema.py +++ b/tap_github/schema.py @@ -51,6 +51,9 @@ def get_schemas(): replication_method = (hasattr(stream_metadata, 'replication_method') or None) and stream_metadata.replication_method ) mdata = metadata.to_map(mdata) + if stream_metadata.parent is not None: + mdata = metadata.write(mdata, (), 'parent-tap-stream-id', stream_metadata.parent) + # Loop through all keys and make replication keys and primary keys of child stream which are not automatic in parent stream of automatic inclusion for field_name in schema['properties'].keys(): From f6ad80d8b5094cfcebd0113f841c11c1a7b40385 Mon Sep 17 00:00:00 2001 From: Bryant Gray Date: Wed, 3 Sep 2025 14:40:13 -0400 Subject: [PATCH 24/30] Use python 3.12 in circleci (#221) * Use python 3.12 in circleci * assertEquals -> assertEqual * unpin pylint * make pylint happy --- .circleci/config.yml | 6 +++--- CHANGELOG.md | 3 +++ setup.py | 4 ++-- tap_github/client.py | 2 +- tap_github/streams.py | 1 - tests/unittests/test_exception_handling.py | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 55b7e1d7..0e334e8a 100644 --- a/.circleci/config.yml 
+++ b/.circleci/config.yml @@ -11,9 +11,9 @@ jobs: - run: name: 'Setup virtual env' command: | - uv venv --python 3.9 /usr/local/share/virtualenvs/tap-github + uv venv --python 3.12 /usr/local/share/virtualenvs/tap-github source /usr/local/share/virtualenvs/tap-github/bin/activate - uv pip install -U pip setuptools + uv pip install -U setuptools uv pip install .[dev] - run: name: 'JSON Validator' @@ -24,7 +24,7 @@ jobs: name: 'pylint' command: | source /usr/local/share/virtualenvs/tap-github/bin/activate - pylint tap_github --disable 'missing-module-docstring,missing-function-docstring,missing-class-docstring,line-too-long,invalid-name,too-many-lines,consider-using-f-string,too-many-arguments,too-many-locals' + pylint tap_github --disable 'missing-module-docstring,missing-function-docstring,missing-class-docstring,line-too-long,invalid-name,too-many-lines,consider-using-f-string,too-many-arguments,too-many-positional-arguments,too-many-locals,unnecessary-lambda-assignment,unspecified-encoding' - run: name: 'Unit Tests' command: | diff --git a/CHANGELOG.md b/CHANGELOG.md index 4727159a..138ecbea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 3.3.1 + * Fix new pylint issues [#221](https://github.com/singer-io/tap-github/pull/221) + # 3.3.0 * Adds `forced_replication_method` and `parent_tap_stream_id` as discoverable metadata [#220](https://github.com/singer-io/tap-github/pull/220) diff --git a/setup.py b/setup.py index e957820c..05e7da6e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.3.0', + version='3.3.1', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', @@ -16,7 +16,7 @@ ], extras_require={ 'dev': [ - 'pylint==2.6.2', + 'pylint', 'ipdb', 'nose', 'requests-mock==1.9.3' diff --git a/tap_github/client.py b/tap_github/client.py index 9e846282..90a01568 100644 --- a/tap_github/client.py +++ 
b/tap_github/client.py @@ -277,7 +277,7 @@ def extract_repos_from_config(self): unique_repos = set() # Insert the duplicate repos found in the config repo_paths into duplicates - duplicate_repos = [x for x in repo_paths if x in unique_repos or (unique_repos.add(x) or False)] + duplicate_repos = [x for x in repo_paths if x in unique_repos or (unique_repos.add(x))] if duplicate_repos: LOGGER.warning("Duplicate repositories found: %s and will be synced only once.", duplicate_repos) diff --git a/tap_github/streams.py b/tap_github/streams.py index f29380fe..707898ef 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -125,7 +125,6 @@ def write_bookmarks(self, stream, selected_streams, bookmark_value, repo_path, s for child in stream_obj.children: self.write_bookmarks(child, selected_streams, bookmark_value, repo_path, state) - # pylint: disable=no-self-use def get_child_records(self, client, catalog, diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index 8c381054..51e1152d 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -89,7 +89,7 @@ def test_error_message_and_call_count(self, mocked_parse_args, mocked_request, m self.assertEqual(str(e.exception), expected_error_message) # Verify the call count for each error. 
- self.assertEquals(call_count, mocked_request.call_count) + self.assertEqual(call_count, mocked_request.call_count) @mock.patch("tap_github.client.LOGGER.warning") def test_skip_404_error(self, mock_logger, mocked_parse_args, mocked_request, mock_verify_access, mock_sleep): From 7241a86508e006c1ba97837d1b260d8f8fd23294 Mon Sep 17 00:00:00 2001 From: Bryant Gray Date: Tue, 23 Sep 2025 16:15:59 -0400 Subject: [PATCH 25/30] Check for bad creds before checking for rate limit (#222) --- tap_github/client.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tap_github/client.py b/tap_github/client.py index 90a01568..4f04442a 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -204,6 +204,10 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) with metrics.http_request_timer(source) as timer: self.session.headers.update(headers) resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) + # Check for bad creds before checking rate throttling because a bad + # creds response does not include rate limit headers + if resp.status_code == 401: + raise_for_error(resp, source, stream, self, should_skip_404) if rate_throttling(resp): # If the API rate limit is reached, the function will be recursively self.authed_get(source, url, headers, stream, should_skip_404) From 67bad8215516af8485a172e7f4b3351f1e4c8651 Mon Sep 17 00:00:00 2001 From: Bryant Gray Date: Wed, 24 Sep 2025 09:50:13 -0400 Subject: [PATCH 26/30] bump version to v3.3.2 and update changelog (#223) --- CHANGELOG.md | 3 +++ setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 138ecbea..97d18c9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 3.3.2 + * Fix check for bad credentials [#222](https://github.com/singer-io/tap-github/pull/222) + # 3.3.1 * Fix new pylint issues [#221](https://github.com/singer-io/tap-github/pull/221) diff --git a/setup.py b/setup.py 
index 05e7da6e..a00a40ce 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.3.1', + version='3.3.2', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 396a4a1f8eeadce10280ea8b57c2b73556f353d6 Mon Sep 17 00:00:00 2001 From: Andy Lu Date: Thu, 4 Dec 2025 10:15:51 -0500 Subject: [PATCH 27/30] SAC-28616: Fix bug in `translate_state` (#224) * Add tests and prove there's a bug in `translate_state` Co-authored-by: Daniel Bohan * add test 1/8 * add test 2/8 * add test 3/8 * add test 4/8 * add test 5/8 * add test 6/8 * add test 7/8 * add test 8/8 * Clarify doc string * Fix `test__old_style__stream_not_in_catalog__repo_in_state` If `key in repositories` is true, we know we've found a repo in the wrong layer of the old bookmark. So we should translate it and reshape it * translate_state should only touch `bookmarks`, don't wipe away other things --------- Co-authored-by: Daniel Bohan --- tap_github/sync.py | 16 +- tests/unittests/test_sync.py | 386 ++++++++++++++++++++++++++++++++++- 2 files changed, 396 insertions(+), 6 deletions(-) diff --git a/tap_github/sync.py b/tap_github/sync.py index 961f17f6..1e4edea7 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -119,19 +119,25 @@ def translate_state(state, catalog, repositories): for key in previous_state_keys: # Loop through each key of `bookmarks` available in the previous state. - for inner_key in state['bookmarks'][key].keys(): - if inner_key not in stream_names and inner_key not in repositories: - # Return the existing state if all repos from the previous state are deselected(not found) in the current sync. 
- return state + for inner_key, inner_value in state['bookmarks'][key].items(): + if inner_key in stream_names or key in repositories: + new_state['bookmarks'][inner_key][key] = inner_value + else: + new_state['bookmarks'][key][inner_key] = inner_value for stream in catalog['streams']: stream_name = stream['tap_stream_id'] for repo in repositories: if bookmarks.get_bookmark(state, stream_name, repo): - return state + new_state['bookmarks'][stream_name][repo] = bookmarks.get_bookmark(state, stream_name, repo) if bookmarks.get_bookmark(state, repo, stream_name): new_state['bookmarks'][stream_name][repo] = bookmarks.get_bookmark(state, repo, stream_name) + # Preserve other key-value pairs in state + for key, value in state.items(): + if key != "bookmarks": + new_state[key] = value + return new_state def get_stream_to_sync(catalog): diff --git a/tests/unittests/test_sync.py b/tests/unittests/test_sync.py index 5f1e9c52..594b8732 100644 --- a/tests/unittests/test_sync.py +++ b/tests/unittests/test_sync.py @@ -1,6 +1,6 @@ import unittest from unittest import mock -from tap_github.sync import sync, write_schemas +from tap_github.sync import sync, write_schemas, translate_state @@ -152,3 +152,387 @@ def test_no_streams_selected(self, mock_update_curr_sync, mock_selected_streams, # def test_nested_child_selected(self, mock_write_schema): # write_schemas("project_cards", self.mock_catalog, ["project_cards"]) # mock_write_schema.assert_called_with("project_cards", mock.ANY, mock.ANY) + + +class TestTranslateState(unittest.TestCase): + """Tests for `translate_state` + + There are many combinations of test cases due to: + - 2 versions of the state structure + - "Old style" repo-stream + - "New style" stream-repo + - 2 possibilities of a stream being in/not-in catalog + - stream in catalog + - stream not in catalog + - 2 possibilities of a repo being in/not-in state + - repo in state + - repo not in state + """ + + def test_repo_stream_state_is_translated(self): + state = { + 
"bookmarks": { + "singer-io/tap-adwords": { + "commits": { + "since": "2018-11-14T13:21:20.700360Z" + } + }, + "singer-io/tap-salesforce": { + "commits": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "commits"}]} + repos = ["singer-io/tap-adwords"] + + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "commits": { + "singer-io/tap-adwords": { + "since": "2018-11-14T13:21:20.700360Z" + }, + "singer-io/tap-salesforce": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + } + + assert actual == expected + + def test_stream_repo_state_is_not_translated(self): + state = { + "bookmarks": { + "commits": { + "singer-io/tap-adwords": { + "since": "2018-11-14T13:21:20.700360Z" + }, + "singer-io/tap-salesforce": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + } + catalog = {"streams": [{"tap_stream_id": "commits"}]} + repos = ["singer-io/tap-adwords"] + + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "commits": { + "singer-io/tap-adwords": { + "since": "2018-11-14T13:21:20.700360Z" + }, + "singer-io/tap-salesforce": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + } + + assert actual == expected + + def test_stream_repo_state_and_not_selected_is_not_translated(self): + state = { + "bookmarks": { + "commits": { + "singer-io/tap-adwords": { + "since": "2018-11-14T13:21:20.700360Z" + }, + "singer-io/tap-salesforce": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + } + catalog = {"streams": [{"tap_stream_id": "issues"}]} + repos = ["singer-io/tap-adwords"] + + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "commits": { + "singer-io/tap-adwords": { + "since": "2018-11-14T13:21:20.700360Z" + }, + "singer-io/tap-salesforce": { + "since": "2018-11-14T13:21:20.700360Z" + } + } + } + } + assert actual == expected + + def test_real_sceanario(self): + state = { + "bookmarks": { + "issue_events": { + 
"singer-io/tap-github": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "commits"}]} + repos = ["singer-io/tap-github"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "issue_events": { + "singer-io/tap-github": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected + + def test__old_style__stream_in_catalog__repo_in_state(self): + """ + We have a bookmark and know that the repo is in the wrong layer + and the stream is in the wrong layer. This means we should + translate the shape + """ + state = { + "bookmarks": { + "singer-io/tap-fake-repo": { + "fake_stream": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "fake_stream"}]} + repos = ["singer-io/tap-fake-repo"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "fake_stream": { + "singer-io/tap-fake-repo": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected + + def test__old_style__stream_in_catalog__repo_not_in_state(self): + """ + We have a bookmark and know that the stream is in the wrong + layer. We have to assume the unknown layer is a repo. This + means we should translate the shape + """ + + state = { + "bookmarks": { + "singer-io/tap-fake-repo-a": { + "fake_stream": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "fake_stream"}]} + repos = ["singer-io/tap-fake-repo-b"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "fake_stream": { + "singer-io/tap-fake-repo-a": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected + + def test__old_style__stream_not_in_catalog__repo_in_state(self): + """ + We have a bookmark and know that the repo is in the wrong + layer. We have to assume the unknown layer is a stream. 
This + means we should translate the shape + """ + + state = { + "bookmarks": { + "singer-io/tap-fake-repo": { + "fake_stream_a": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "fake_stream_b"}]} + repos = ["singer-io/tap-fake-repo"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "fake_stream_a": { + "singer-io/tap-fake-repo": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected + + def test__old_style__stream_not_in_catalog__repo_not_in_state(self): + """ + We have a bookmark and don't know anything about the two + layers. This means we should not translate the shape + """ + + state = { + "bookmarks": { + "singer-io/tap-fake-repo-a": { + "fake_stream_a": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "fake_stream_b"}]} + repos = ["singer-io/tap-fake-repo-b"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "singer-io/tap-fake-repo-a": { + "fake_stream_a": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected + + def test__new_style__stream_in_catalog__repo_in_state(self): + """ + We have a bookmark and know that the repo is in the right + layer and the stream is in the right layer. This means we + should not translate the shape + """ + + state = { + "bookmarks": { + "fake_stream": { + "singer-io/tap-fake-repo": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "fake_stream"}]} + repos = ["singer-io/tap-fake-repo"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "fake_stream": { + "singer-io/tap-fake-repo": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected + + def test__new_style___stream_in_catalog__repo_not_state(self): + """ + We have a bookmark and know that the stream is in the right + layer. 
We assume the unknown layer is a repo. This means we + should not translate the shape + """ + + state = { + "bookmarks": { + "fake_stream": { + "singer-io/tap-fake-repo-a": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "fake_stream"}]} + repos = ["singer-io/tap-fake-repo-b"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "fake_stream": { + "singer-io/tap-fake-repo-a": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected + + def test__new_style__stream_not_in_catalog__repo_in_state(self): + """ + We have a bookmark and know that the repo is in the right + layer. We assume the unknown layer is a stream. This means we + should not translate the shape + """ + + state = { + "bookmarks": { + "fake_stream_a": { + "singer-io/tap-fake-repo": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "fake_stream_b"}]} + repos = ["singer-io/tap-fake-repo"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "fake_stream_a": { + "singer-io/tap-fake-repo": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected + + def test__new_style__stream_not_in_catalog__repo_not_in_state(self): + """ + We have a bookmark and don't know anything about the two + layers. 
This means we should not translate the shape + """ + + state = { + "bookmarks": { + "fake_stream_a": { + "singer-io/tap-fake-repo-a": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + + catalog = {"streams": [{"tap_stream_id": "fake_stream_b"}]} + repos = ["singer-io/tap-fake-repo-b"] + actual = translate_state(state, catalog, repos) + expected = { + "bookmarks": { + "fake_stream_a": { + "singer-io/tap-fake-repo-a": { + "since": "2025-09-24T13:50:18Z" + } + } + } + } + assert actual == expected From 08f2724bb65b3d406cc50908e3f69d4ebe51cbfa Mon Sep 17 00:00:00 2001 From: Andy Lu Date: Wed, 10 Dec 2025 15:21:34 -0500 Subject: [PATCH 28/30] Bump version, update changelog (#226) --- CHANGELOG.md | 4 ++++ setup.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97d18c9d..8edf6230 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +# 3.3.3 + * Fix `translate_state` to not delete state [#224](https://github.com/singer-io/tap-github/pull/224) + * Add unit tests to prove there's a bug and prevent future regression + # 3.3.2 * Fix check for bad credentials [#222](https://github.com/singer-io/tap-github/pull/222) diff --git a/setup.py b/setup.py index a00a40ce..9d7ba465 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.3.2', + version='3.3.3', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From a3d6140c4eea7e01f01c0a7608ea07e82b44737d Mon Sep 17 00:00:00 2001 From: Leslie VanDeMark <38043390+leslievandemark@users.noreply.github.com> Date: Wed, 13 May 2026 09:13:44 -0400 Subject: [PATCH 29/30] fix malformed URL causing 'since' filter to be dropped for Issues and Comments streams (#230) * fix malformed url builder, ? 
was causing since clause to be dropped * make pylint happy * another unit test, version bump and changelog update * bump requests to 2.33.0 for dependabot --- CHANGELOG.md | 3 +++ setup.py | 4 ++-- tap_github/streams.py | 6 ++++-- tests/unittests/test_stream.py | 19 ++++++++++++++++--- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8edf6230..c6f5d5bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 3.3.4 + * Fix malformed URL causing 'since' filter to be dropped for Issues and Comments streams [#230](https://github.com/singer-io/tap-github/pull/230) + # 3.3.3 * Fix `translate_state` to not delete state [#224](https://github.com/singer-io/tap-github/pull/224) * Add unit tests to prove there's a bug and prevent future regression diff --git a/setup.py b/setup.py index 9d7ba465..9948a3ed 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.3.3', + version='3.3.4', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', @@ -11,7 +11,7 @@ py_modules=['tap_github'], install_requires=[ 'singer-python==5.12.1', - 'requests==2.32.4', + 'requests==2.33.0', 'backoff==1.8.0' ], extras_require={ diff --git a/tap_github/streams.py b/tap_github/streams.py index 707898ef..7eda4605 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -74,8 +74,9 @@ def build_url(self, base_url, repo_path, bookmark): Build the full url with parameters and attributes. """ if self.filter_param: - # Add the since parameter for incremental streams - query_string = '?since={}'.format(bookmark) + # Use '&' if the path already contains a query string, otherwise '?' + separator = '&' if '?' in (self.path or '') else '?' 
+ query_string = '{}since={}'.format(separator, bookmark) else: query_string = '' @@ -387,6 +388,7 @@ def sync_endpoint(self, ): records = response.json() extraction_time = singer.utils.now() + for record in records: record['_sdc_repository'] = repo_path self.add_fields_at_1st_level(record = record, parent_record = None) diff --git a/tests/unittests/test_stream.py b/tests/unittests/test_stream.py index 7ad58c97..5641d6ff 100644 --- a/tests/unittests/test_stream.py +++ b/tests/unittests/test_stream.py @@ -1,6 +1,6 @@ import unittest from unittest import mock -from tap_github.streams import Comments, Reviews, TeamMemberships, Teams, PullRequests, get_schema, get_child_full_url, get_bookmark +from tap_github.streams import Comments, Reviews, TeamMemberships, Teams, PullRequests, Commits, Issues, get_schema, get_child_full_url, get_bookmark from parameterized import parameterized @@ -62,8 +62,21 @@ class TestBuildUrl(unittest.TestCase): """ @parameterized.expand([ - ["test_stream_with_filter_params", "org/test-repo", "https://api.github.com/repos/org/test-repo/issues/comments?sort=updated&direction=desc?since=2022-01-01T00:00:00Z", Comments], - ["test_stream_with_organization", "org", "https://api.github.com/orgs/org/teams", Teams] + # Path already has '?' — since must be appended with '&' + ["comments_appends_ampersand_since", "org/test-repo", + "https://api.github.com/repos/org/test-repo/issues/comments?sort=updated&direction=desc&since=2022-01-01T00:00:00Z", + Comments], + ["issues_appends_ampersand_since", "org/test-repo", + "https://api.github.com/repos/org/test-repo/issues?state=all&sort=updated&direction=desc&since=2022-01-01T00:00:00Z", + Issues], + # Path has no '?' — since must be appended with '?' 
+ ["commits_appends_question_mark_since", "org/test-repo", + "https://api.github.com/repos/org/test-repo/commits?since=2022-01-01T00:00:00Z", + Commits], + # Org-level stream with no filter_param — no since appended + ["teams_org_url_no_since", "org", + "https://api.github.com/orgs/org/teams", + Teams], ]) def test_build_url(self, name, param, expected_url, stream_class): """ From a0f6f6c824af117e003723bd7f9196cbd6a804c8 Mon Sep 17 00:00:00 2001 From: Mukesh Bhatt Date: Wed, 3 Jun 2026 11:10:18 +0530 Subject: [PATCH 30/30] remove streams from catalog if not have permission and bump version (#229) * code for remove stream from catalog if do not have permission * remove collaborators from integration tests * Address PR review comments: fix double extract_repos_from_config, sort repos, derive org from repo_path, broaden GithubException catch, fix test side_effect * adding back * update tests * resolve review comments * resolve review comments * update changelog * update changelog * update changelog --- CHANGELOG.md | 4 ++ setup.py | 8 ++-- tap_github/client.py | 19 ++++++++- tap_github/discover.py | 74 ++++++++++++++++++++++++++++++++- tests/base.py | 5 ++- tests/test_github_all_fields.py | 8 ++-- tests/unittests/test_main.py | 30 +++++++------ 7 files changed, 121 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6f5d5bd..9befc927 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +# 3.4.0 + * Exclude 403-forbidden streams from discovery [#229](https://github.com/singer-io/tap-github/pull/229) + * Bump dependencies for compliance + # 3.3.4 * Fix malformed URL causing 'since' filter to be dropped for Issues and Comments streams [#230](https://github.com/singer-io/tap-github/pull/230) diff --git a/setup.py b/setup.py index 9948a3ed..7c5f0a51 100644 --- a/setup.py +++ b/setup.py @@ -3,16 +3,16 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='3.3.4', + version='3.4.0', 
description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', classifiers=['Programming Language :: Python :: 3 :: Only'], py_modules=['tap_github'], install_requires=[ - 'singer-python==5.12.1', - 'requests==2.33.0', - 'backoff==1.8.0' + 'singer-python==6.8.0', + 'requests==2.34.2', + 'backoff==2.2.1' ], extras_require={ 'dev': [ diff --git a/tap_github/client.py b/tap_github/client.py index 4f04442a..3198bbe6 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -249,11 +249,13 @@ def verify_repo_access(self, url_for_repo, repo): message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) raise NotFoundException(message) from None - def verify_access_for_repo(self): + def verify_access_for_repo(self, repositories=None): """ For all the repositories mentioned in the config, check the access for each repos. + Accepts an optional precomputed list of repositories to avoid redundant API calls. """ - repositories, org = self.extract_repos_from_config() # pylint: disable=unused-variable + if repositories is None: + repositories, _ = self.extract_repos_from_config() for repo in repositories: @@ -263,6 +265,19 @@ def verify_access_for_repo(self): # Verifying for Repo access self.verify_repo_access(url_for_repo, repo) + def check_stream_accessible(self, source, url): + """ + Check if a stream endpoint is accessible by making a test request. + Returns True if accessible (HTTP 200), False if permission is denied (403) + or the resource is not found (404). 
+ """ + try: + self.authed_get(source, url, should_skip_404=False) + return True + except GithubException as e: + LOGGER.warning("Stream '%s' is not accessible: %s", source, str(e)) + return False + def extract_orgs_from_config(self): """ Extracts all organizations from the config diff --git a/tap_github/discover.py b/tap_github/discover.py index b39449e5..e05e9883 100644 --- a/tap_github/discover.py +++ b/tap_github/discover.py @@ -2,20 +2,90 @@ from singer import metadata from singer.catalog import Catalog, CatalogEntry, Schema from tap_github.schema import get_schemas +from tap_github.streams import STREAMS LOGGER = singer.get_logger() + +def _build_stream_probe_url(base_url, stream_obj, repo_path, org): + """ + Build a minimal URL to probe whether a stream endpoint is accessible. + Uses per_page=1 to keep the response small. + """ + # Strip any existing query parameters so we control the query string. + base_path = stream_obj.path.split('?')[0] + if stream_obj.use_organization: + url = f"{base_url}/{base_path.format(org)}" + else: + url = f"{base_url}/repos/{repo_path}/{base_path}" + return url + '?per_page=1' + + +def _is_stream_and_ancestors_accessible(stream_name, inaccessible_streams): + """ + Recursively check whether a stream or any of its ancestors is inaccessible. + Returns False if the stream itself or any ancestor appears in inaccessible_streams. + """ + if stream_name in inaccessible_streams: + return False + parent = STREAMS[stream_name].parent + if parent: + return _is_stream_and_ancestors_accessible(parent, inaccessible_streams) + return True + + +def _identify_inaccessible_streams(client, repositories): + """ + Verify repo access and probe each top-level stream endpoint. + Returns a set of stream names that are not accessible (403/404). + """ + # Sort for deterministic probe behavior across runs. + repositories = sorted(repositories) + client.verify_access_for_repo(repositories) + + # Derive org from the first repo to ensure consistency. 
+ repo_path = repositories[0] if repositories else None + org = repo_path.split('/')[0] if repo_path else None + + inaccessible_streams = set() + if repo_path: + for stream_name, stream_class in STREAMS.items(): + if stream_class.parent is None: + test_url = _build_stream_probe_url(client.base_url, stream_class, repo_path, org) + if not client.check_stream_accessible(stream_name, test_url): + inaccessible_streams.add(stream_name) + LOGGER.warning( + "Stream '%s' will be excluded from the catalog: " + "insufficient permissions or resource not found.", + stream_name + ) + return inaccessible_streams + + def discover(client): """ Run the discovery mode, prepare the catalog file and return catalog. + Streams whose API endpoints are not accessible (403/404) are excluded. """ - # Check credential in the discover mode. - client.verify_access_for_repo() + # Extract repos/orgs once and reuse to avoid double API calls. + repositories, _ = client.extract_repos_from_config() + + inaccessible_streams = _identify_inaccessible_streams(client, repositories) schemas, field_metadata = get_schemas() catalog = Catalog([]) for stream_name, schema_dict in schemas.items(): + # Exclude streams that are inaccessible or whose ancestor is inaccessible. 
+ if not _is_stream_and_ancestors_accessible(stream_name, inaccessible_streams): + if stream_name not in inaccessible_streams: + LOGGER.warning( + "Stream '%s' will be excluded from the catalog: " + "parent stream is not accessible.", + stream_name + ) + continue + try: schema = Schema.from_dict(schema_dict) mdata = field_metadata[stream_name] diff --git a/tests/base.py b/tests/base.py index c2e6114c..009b4c63 100644 --- a/tests/base.py +++ b/tests/base.py @@ -180,7 +180,7 @@ def expected_metadata(self): def expected_replication_method(self): """ - Return a dictionary with key of table name + Return a dictionary with key of table name and value of replication method """ return {table: properties.get(self.REPLICATION_METHOD, None) @@ -266,7 +266,8 @@ def run_and_verify_check_mode(self, conn_id): found_catalog_names = set(map(lambda c: c['stream_name'], found_catalogs)) LOGGER.info(found_catalog_names) - self.assertSetEqual(self.expected_streams(), found_catalog_names, msg="discovered schemas do not match") + unexpected_streams = found_catalog_names - self.expected_streams() + self.assertFalse(unexpected_streams, msg="discovered unexpected schemas: {}".format(unexpected_streams)) LOGGER.info("discovered schemas are OK") return found_catalogs diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py index f3f27a4b..2b61b682 100644 --- a/tests/test_github_all_fields.py +++ b/tests/test_github_all_fields.py @@ -70,14 +70,14 @@ 'mentions_count', 'reactions' }, - 'collaborators': { - 'email', - 'name' - }, 'reviews': { 'body_text', 'body_html' }, + 'collaborators': { + 'email', + 'name' + }, 'teams': { 'permissions' }, diff --git a/tests/unittests/test_main.py b/tests/unittests/test_main.py index 44d5d22c..44141a05 100644 --- a/tests/unittests/test_main.py +++ b/tests/unittests/test_main.py @@ -5,9 +5,9 @@ class MockArgs: """Mock args object class""" - + def __init__(self, config = None, properties = None, state = None, discover = False) -> None: - 
self.config = config + self.config = config self.properties = properties self.state = state self.discover = discover @@ -20,14 +20,14 @@ class TestDiscoverMode(unittest.TestCase): """ mock_config = {"start_date": "", "access_token": ""} - + @mock.patch("tap_github._discover") def test_discover_with_config(self, mock_discover, mock_args, mock_verify_access): """Test `_discover` function is called for discover mode""" mock_discover.return_value = dict() mock_args.return_value = MockArgs(discover = True, config = self.mock_config) main() - + self.assertTrue(mock_discover.called) @@ -49,22 +49,22 @@ def test_sync_with_properties(self, mock_discover, mock_sync, mock_args, mock_cl mock_client.return_value = "mock_client" mock_args.return_value = MockArgs(config=self.mock_config, properties=self.mock_catalog) main() - + # Verify `_sync` is called with expected arguments mock_sync.assert_called_with("mock_client", self.mock_config, {}, self.mock_catalog) - + # verify `_discover` function is not called self.assertFalse(mock_discover.called) @mock.patch("tap_github._discover") def test_sync_without_properties(self, mock_discover, mock_sync, mock_args, mock_client): """Test sync mode without properties given in args""" - + mock_discover.return_value = {"schema": "", "metadata": ""} mock_client.return_value = "mock_client" mock_args.return_value = MockArgs(config=self.mock_config) main() - + # Verify `_sync` is called with expected arguments mock_sync.assert_called_with("mock_client", self.mock_config, {}, {"schema": "", "metadata": ""}) @@ -77,25 +77,29 @@ def test_sync_with_state(self, mock_sync, mock_args, mock_client): mock_client.return_value = "mock_client" mock_args.return_value = MockArgs(config=self.mock_config, properties=self.mock_catalog, state=mock_state) main() - + # Verify `_sync` is called with expected arguments mock_sync.assert_called_with("mock_client", self.mock_config, mock_state, self.mock_catalog) @mock.patch("tap_github.GithubClient") class 
TestDiscover(unittest.TestCase): """Test `discover` function.""" - + def test_discover(self, mock_client): - + mock_client.extract_repos_from_config.return_value = (['org/repo'], {'org'}) + mock_client.check_stream_accessible.return_value = True + return_catalog = discover(mock_client) - + self.assertIsInstance(return_catalog, dict) @mock.patch("tap_github.discover.Schema") @mock.patch("tap_github.discover.LOGGER.error") def test_discover_error_handling(self, mock_logger, mock_schema, mock_client): """Test discover function if exception arises.""" - mock_schema.from_dict.side_effect = [Exception] + mock_client.extract_repos_from_config.return_value = (['org/repo'], {'org'}) + mock_client.check_stream_accessible.return_value = True + mock_schema.from_dict.side_effect = Exception with self.assertRaises(Exception): discover(mock_client)