From e28397291c99268325f91531d9c60184703ab185 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Fri, 7 Apr 2023 10:47:40 +0900 Subject: [PATCH 01/37] feat: Add repositories table --- CHANGELOG.md | 3 + tap_github/schemas/repositories.json | 348 +++++++++++++++++++++++++++ tap_github/streams.py | 15 ++ tap_github/sync.py | 2 +- 4 files changed, 367 insertions(+), 1 deletion(-) create mode 100644 tap_github/schemas/repositories.json diff --git a/CHANGELOG.md b/CHANGELOG.md index d84d7ad8..20ce0ebd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 2.1.0 + * Add repositories table + # 2.0.0 * Schema updates [#170](https://github.com/singer-io/tap-github/pull/170) [#169](https://github.com/singer-io/tap-github/pull/169) * Update data types of fields in `events` and `issue_events` stream diff --git a/tap_github/schemas/repositories.json b/tap_github/schemas/repositories.json new file mode 100644 index 00000000..135fb56d --- /dev/null +++ b/tap_github/schemas/repositories.json @@ -0,0 +1,348 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "owner_id": { + "type": ["null", "integer"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "private": { + "type": ["null", "boolean"] + }, + "html_url": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "language": { + "type": ["null", "string"] + }, + "forks_count": { + "type": ["null", "integer"] + }, + "stargazers_count": { + "type": ["null", "integer"] + }, + "watchers_count": { + "type": ["null", "integer"] + }, + "size": { + "type": ["null", "integer"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "open_issues_count": { + "type": ["null", "integer"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "visibility": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "role_name": { + "type": ["null", "string"] + }, + "temp_clone_token": { + "type": ["null", "string"] + }, + "delete_branch_on_merge": { + "type": ["null", "boolean"] + }, + "subscribers_count": { + "type": ["null", "integer"] + }, + "network_count": { + "type": ["null", "integer"] + }, + "forks": { + "type": ["null", "integer"] + }, + "open_issues": { + "type": ["null", "integer"] + }, + "watchers": { + "type": ["null", "integer"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "code_of_conduct": { + "type": ["null", "object"], + "properties": { + "key": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "body": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + } + } + }, + "license": { + "type": ["null", "object"], + "properties": { + "key": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "spdx_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + } + } + }, + "security_and_analysis": { + "type": ["null", "object"], + "properties": { + "advanced_security": { + "type": ["null", "object"], + "properties": { + "type": ["null", "object"], + "properties": { + "status": { + "type": ["null", "string"] + } + } + } + }, + "secret_scanning": { + "type": ["null", "object"], + "properties": { + "type": ["null", "object"], + "properties": { + "status": { + "type": ["null", "string"] + } + } + } + }, + "secret_scanning_push_protection": { + "type": ["null", "object"], + "properties": { + "type": ["null", "object"], + "properties": { + "status": { + "type": ["null", "string"] + } + } + } + } + } + }, + "archive_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "forks_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + } + } + } + \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index 278dd05a..54d9ddca 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -740,9 +740,24 @@ def add_fields_at_1st_level(self, record, parent_record = None): """ record['user_id'] = record['user']['id'] +class Repositories(FullTableStream): + ''' + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-organization-repositories + ''' + tap_stream_id = "repositories" + replication_method = "FULL_TABLE" + key_properties = ["id"] + path = "repos" + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['owner_id'] = record['owner']['id'] # Dictionary of the stream classes STREAMS = { + "repositories": Repositories, "commits": Commits, "comments": Comments, "issues": Issues, diff --git a/tap_github/sync.py b/tap_github/sync.py index a83610ad..38188b9a 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -4,7 +4,7 @@ from tap_github.streams import STREAMS LOGGER = singer.get_logger() -STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships'] +STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships', 'repositories'] def get_selected_streams(catalog): ''' From d4ccf649bcb0dbbe073843cfae640dc3cba282f8 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Fri, 7 Apr 2023 14:50:11 +0900 Subject: [PATCH 02/37] fix:Make fixes for repository table schema --- tap_github/schemas/repositories.json | 11 +---------- tap_github/streams.py | 3 ++- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/tap_github/schemas/repositories.json b/tap_github/schemas/repositories.json index 135fb56d..d1b24a59 100644 --- a/tap_github/schemas/repositories.json +++ b/tap_github/schemas/repositories.json @@ -188,34 +188,25 @@ "advanced_security": { "type": ["null", "object"], "properties": { - "type": ["null", "object"], - "properties": { "status": { - "type": ["null", "string"] + "type": ["null", "string"] } - } } }, "secret_scanning": { "type": ["null", "object"], "properties": { - "type": ["null", "object"], - "properties": { "status": { "type": ["null", "string"] } - } } }, "secret_scanning_push_protection": { "type": ["null", "object"], "properties": { - "type": ["null", "object"], - "properties": { "status": { "type": ["null", "string"] } - } } } } diff --git a/tap_github/streams.py b/tap_github/streams.py index 54d9ddca..617e8f1c 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -747,7 +747,8 @@ class Repositories(FullTableStream): tap_stream_id = "repositories" replication_method = "FULL_TABLE" key_properties = ["id"] - path = "repos" + use_organization = True + path = "orgs/{}/repos" def add_fields_at_1st_level(self, record, parent_record = None): """ From 53a36f311777db728b4ee4f0f4ca78f297b31c71 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Fri, 7 Apr 2023 14:55:38 +0900 Subject: [PATCH 03/37] feat:Add user email table & add commit dependency --- tap_github/schemas/commits.json | 9 +++ tap_github/schemas/user_email.json | 15 ++++ tap_github/streams.py | 118 +++++++++++++++++++---------- 3 files changed, 104 insertions(+), 38 deletions(-) create mode 100644 tap_github/schemas/user_email.json diff --git a/tap_github/schemas/commits.json b/tap_github/schemas/commits.json index cf873448..c3828e35 100644 --- a/tap_github/schemas/commits.json +++ b/tap_github/schemas/commits.json @@ -253,6 +253,15 @@ } } }, + "committer_email": { + "type": ["null", "string"] + }, + "committer_id": { + "type": ["null", "integer"] + }, + "committer_name": { + "type": ["null", "string"] + }, "committer": { "$ref": "shared/user.json#/" }, diff --git a/tap_github/schemas/user_email.json b/tap_github/schemas/user_email.json new file mode 100644 index 00000000..d299b5b5 --- /dev/null +++ b/tap_github/schemas/user_email.json @@ -0,0 +1,15 @@ +{ + "type": ["null", "object"], + "properties": { + "committer_email": { + "type": ["null", "string"] + }, + "committer_id": { + "type": ["null", "integer"] + }, + "committer_name": { + "type": ["null", "string"] + } + } + } + \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index 617e8f1c..f11dd7e8 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -27,7 +27,9 @@ def get_child_full_url(domain, child_object, repo_path, parent_id, grand_parent_ Build the child stream's URL based on the parent and the grandparent's ids. """ - if child_object.use_repository: + if child_object.no_path: + return + elif child_object.use_repository: # The `use_repository` represents that the url contains /repos and the repository name. child_full_url = '{}/repos/{}/{}'.format( domain, @@ -68,6 +70,8 @@ class Stream: use_repository = False headers = {'Accept': '*/*'} parent = None + inherit_parent_fields = [] + no_path = False def build_url(self, base_url, repo_path, bookmark): """ @@ -151,50 +155,71 @@ def get_child_records(self, child_full_url = get_child_full_url(client.base_url, child_object, repo_path, parent_id, grand_parent_id) stream_catalog = get_schema(catalog, child_object.tap_stream_id) + if child_full_url is not None: + with metrics.record_counter(child_object.tap_stream_id) as counter: + for response in client.authed_get_all_pages( + child_object.tap_stream_id, + child_full_url, + stream = child_object.tap_stream_id + ): + records = response.json() + extraction_time = singer.utils.now() + + if isinstance(records, list): + # Loop through all the records of response + for record in records: + record['_sdc_repository'] = repo_path + child_object.add_fields_at_1st_level(record = record, parent_record = parent_record) - with metrics.record_counter(child_object.tap_stream_id) as counter: - for response in client.authed_get_all_pages( - child_object.tap_stream_id, - child_full_url, - stream = child_object.tap_stream_id - ): - records = response.json() - extraction_time = singer.utils.now() - - if isinstance(records, list): - # Loop through all the records of response - for record in records: - record['_sdc_repository'] = repo_path - child_object.add_fields_at_1st_level(record = record, parent_record = parent_record) - - with singer.Transformer() as transformer: + with singer.Transformer() as transformer: - rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) + rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) - if child_object.tap_stream_id in selected_stream_ids and record.get(child_object.replication_keys, start_date) >= child_bookmark_value: - singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) - counter.increment() + if child_object.tap_stream_id in selected_stream_ids and record.get(child_object.replication_keys, start_date) >= child_bookmark_value: + singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + counter.increment() - # Loop thru each child and nested child in the parent and fetch all the child records. - for nested_child in child_object.children: - if nested_child in stream_to_sync: - # Collect id of child record to pass in the API of its sub-child. - child_id = tuple(record.get(key) for key in STREAMS[nested_child]().id_keys) - # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to - # pass in the API of the current child's sub-child. - child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, record) + # Loop thru each child and nested child in the parent and fetch all the child records. + for nested_child in child_object.children: + if nested_child in stream_to_sync: + # Collect id of child record to pass in the API of its sub-child. + child_id = tuple(record.get(key) for key in STREAMS[nested_child]().id_keys) + # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to + # pass in the API of the current child's sub-child. + child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, record) - else: - # Write JSON response directly if it is a single record only. - records['_sdc_repository'] = repo_path - child_object.add_fields_at_1st_level(record = records, parent_record = parent_record) + else: + # Write JSON response directly if it is a single record only. + records['_sdc_repository'] = repo_path + child_object.add_fields_at_1st_level(record = records, parent_record = parent_record) - with singer.Transformer() as transformer: + with singer.Transformer() as transformer: - rec = transformer.transform(records, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) - if child_object.tap_stream_id in selected_stream_ids and records.get(child_object.replication_keys, start_date) >= child_bookmark_value : + rec = transformer.transform(records, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) + if child_object.tap_stream_id in selected_stream_ids and records.get(child_object.replication_keys, start_date) >= child_bookmark_value : - singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + elif child_object.no_path: + extraction_time = singer.utils.now() + record = {} + for field in child_object.inherit_parent_fields: + record[field] = parent_record.get(field) + with singer.Transformer() as transformer: + + rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) + + if child_object.tap_stream_id in selected_stream_ids: + singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + counter.increment() + + # Loop thru each child and nested child in the parent and fetch all the child records. + for nested_child in child_object.children: + if nested_child in stream_to_sync: + # Collect id of child record to pass in the API of its sub-child. + child_id = tuple(record.get(key) for key in STREAMS[nested_child]().id_keys) + # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to + # pass in the API of the current child's sub-child. + child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, record) # pylint: disable=unnecessary-pass def add_fields_at_1st_level(self, record, parent_record = None): @@ -618,6 +643,7 @@ class Commits(IncrementalStream): replication_keys = "updated_at" key_properties = ["sha"] path = "commits" + children= ["user_email"] filter_param = True def add_fields_at_1st_level(self, record, parent_record = None): @@ -625,6 +651,21 @@ def add_fields_at_1st_level(self, record, parent_record = None): Add fields in the record explicitly at the 1st level of JSON. """ record['updated_at'] = record['commit']['committer']['date'] + record['comitter_email'] = record['commit']['committer']['email'] + record['committer_id'] = record['commit']['committer']['id'] + record['committer_name'] = record['commit']['committer']['name'] + record['committer_login'] = record['commit']['committer']['login'] + +class UserEmail(IncrementalStream): + ''' + https://docs.github.com/en/rest/users/users?apiVersion=2022-11-28 + ''' + tap_stream_id = "user_email" + replication_method = "INCREMENTAL" + key_properties = ["comitter_email"] + id_keys = ['committer_login'] + no_path = True + inherit_parent_fields = ["comitter_email","committer_id","committer_name","committer_login"] class Comments(IncrementalOrderedStream): ''' @@ -780,5 +821,6 @@ def add_fields_at_1st_level(self, record, parent_record = None): "team_members": TeamMembers, "team_memberships": TeamMemberships, "collaborators": Collaborators, - "stargazers": StarGazers + "stargazers": StarGazers, + "user_email": UserEmail } From 67fbafd46692c4c3b4deb81d816e7eb1d3da9054 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Mon, 10 Apr 2023 15:19:23 +0900 Subject: [PATCH 04/37] fix:Fix user email table --- ...er_email.json => commit_users_emails.json} | 9 +- tap_github/schemas/commits.json | 21 ++- tap_github/streams.py | 123 ++++++++++-------- 3 files changed, 84 insertions(+), 69 deletions(-) rename tap_github/schemas/{user_email.json => commit_users_emails.json} (63%) diff --git a/tap_github/schemas/user_email.json b/tap_github/schemas/commit_users_emails.json similarity index 63% rename from tap_github/schemas/user_email.json rename to tap_github/schemas/commit_users_emails.json index d299b5b5..0cc70e8a 100644 --- a/tap_github/schemas/user_email.json +++ b/tap_github/schemas/commit_users_emails.json @@ -1,13 +1,16 @@ { "type": ["null", "object"], "properties": { - "committer_email": { + "email": { "type": ["null", "string"] }, - "committer_id": { + "id": { "type": ["null", "integer"] }, - "committer_name": { + "name": { + "type": ["null", "string"] + }, + "username": { "type": ["null", "string"] } } diff --git a/tap_github/schemas/commits.json b/tap_github/schemas/commits.json index c3828e35..a66ff12f 100644 --- a/tap_github/schemas/commits.json +++ b/tap_github/schemas/commits.json @@ -7,15 +7,6 @@ "node_id": { "type": ["null", "string"] }, - "pr_id": { - "type": ["null", "string"] - }, - "pr_number": { - "type": ["null", "integer"] - }, - "id": { - "type": ["null", "string"] - }, "updated_at": { "type": ["null", "string"], "format": "date-time" @@ -26,6 +17,9 @@ "url": { "type": ["null", "string"] }, + "message": { + "type": ["null", "string"] + }, "parents": { "type": ["null", "array"], "items": { @@ -253,11 +247,14 @@ } } }, - "committer_email": { + "author_email": { + "type": ["null", "string"] + }, + "author_name": { "type": ["null", "string"] }, - "committer_id": { - "type": ["null", "integer"] + "committer_email": { + "type": ["null", "string"] }, "committer_name": { "type": ["null", "string"] diff --git a/tap_github/streams.py b/tap_github/streams.py index f11dd7e8..ecdbbcbe 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -155,8 +155,8 @@ def get_child_records(self, child_full_url = get_child_full_url(client.base_url, child_object, repo_path, parent_id, grand_parent_id) stream_catalog = get_schema(catalog, child_object.tap_stream_id) - if child_full_url is not None: - with metrics.record_counter(child_object.tap_stream_id) as counter: + with metrics.record_counter(child_object.tap_stream_id) as counter: + if child_full_url is not None: for response in client.authed_get_all_pages( child_object.tap_stream_id, child_full_url, @@ -199,27 +199,28 @@ def get_child_records(self, if child_object.tap_stream_id in selected_stream_ids and records.get(child_object.replication_keys, start_date) >= child_bookmark_value : singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) - elif child_object.no_path: - extraction_time = singer.utils.now() - record = {} - for field in child_object.inherit_parent_fields: - record[field] = parent_record.get(field) - with singer.Transformer() as transformer: - - rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) - - if child_object.tap_stream_id in selected_stream_ids: - singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) - counter.increment() - - # Loop thru each child and nested child in the parent and fetch all the child records. - for nested_child in child_object.children: - if nested_child in stream_to_sync: - # Collect id of child record to pass in the API of its sub-child. - child_id = tuple(record.get(key) for key in STREAMS[nested_child]().id_keys) - # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to - # pass in the API of the current child's sub-child. - child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, record) + elif child_object.no_path: + extraction_time = singer.utils.now() + record = {} + for column, field in child_object.inherit_parent_fields: + record[column] = parent_record.get(field) + with singer.Transformer() as transformer: + + rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) + + if child_object.tap_stream_id in selected_stream_ids: + singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + counter.increment() + + # Loop thru each child and nested child in the parent and fetch all the child records. + for nested_child in child_object.children: + if nested_child in stream_to_sync: + # Collect id of child record to pass in the API of its sub-child. + child_id = tuple(record.get(key) for key in STREAMS[nested_child]().id_keys) + if STREAMS[nested_child]().id_keys and not all(child_id): continue + # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to + # pass in the API of the current child's sub-child. + child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, record) # pylint: disable=unnecessary-pass def add_fields_at_1st_level(self, record, parent_record = None): @@ -227,6 +228,16 @@ def add_fields_at_1st_level(self, record, parent_record = None): Add fields in the record explicitly at the 1st level of JSON. """ pass + + def get_field(self,record, field_path): + """ + Get a field of a record from a field path + """ + response = record + for path in field_path: + response = response.get(path) + if not response: return + return response class FullTableStream(Stream): def sync_endpoint(self, @@ -246,7 +257,6 @@ def sync_endpoint(self, full_url = self.build_url(client.base_url, repo_path, None) stream_catalog = get_schema(catalog, self.tap_stream_id) - with metrics.record_counter(self.tap_stream_id) as counter: for response in client.authed_get_all_pages( self.tap_stream_id, @@ -287,7 +297,6 @@ def sync_endpoint(self, stream_to_sync, selected_stream_ids, parent_record = record) - return state class IncrementalStream(Stream): @@ -329,7 +338,6 @@ def sync_endpoint(self, extraction_time = singer.utils.now() # Loop through all records for record in records: - record['_sdc_repository'] = repo_path self.add_fields_at_1st_level(record = record, parent_record = None) @@ -343,7 +351,6 @@ def sync_endpoint(self, # Keep only records whose bookmark is after the last_datetime if bookmark_dttm >= min_bookmark_value: - if self.tap_stream_id in selected_stream_ids and bookmark_dttm >= parent_bookmark_value: rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) @@ -354,24 +361,24 @@ def sync_endpoint(self, if child in stream_to_sync: parent_id = tuple(record.get(key) for key in STREAMS[child]().id_keys) - - # Sync child stream, if it is selected or its nested child is selected. - self.get_child_records(client, - catalog, - child, - parent_id, - repo_path, - state, - start_date, - record.get(self.replication_keys), - stream_to_sync, - selected_stream_ids, - parent_record = record) + if STREAMS[child]().id_keys and not all(parent_id): + pass + else: + # Sync child stream, if it is selected or its nested child is selected. + self.get_child_records(client, + catalog, + child, + parent_id, + repo_path, + state, + start_date, + record.get(self.replication_keys), + stream_to_sync, + selected_stream_ids) else: LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.", self.tap_stream_id, self.key_properties, record[self.key_properties], self.replication_keys) - # Write bookmark for incremental stream. self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) @@ -643,29 +650,36 @@ class Commits(IncrementalStream): replication_keys = "updated_at" key_properties = ["sha"] path = "commits" - children= ["user_email"] + children= ["commit_users_emails"] filter_param = True def add_fields_at_1st_level(self, record, parent_record = None): """ Add fields in the record explicitly at the 1st level of JSON. """ - record['updated_at'] = record['commit']['committer']['date'] - record['comitter_email'] = record['commit']['committer']['email'] - record['committer_id'] = record['commit']['committer']['id'] - record['committer_name'] = record['commit']['committer']['name'] - record['committer_login'] = record['commit']['committer']['login'] + if not record: return + record['updated_at'] = self.get_field(record,['commit','committer','date']) + record['message'] = self.get_field(record,['commit','message']) + record['comit_name'] = self.get_field(record,['commit','committer','name']) + record['author_email'] = self.get_field(record,['commit','author','email']) + record['author_id'] = self.get_field(record,['author','id']) + record['author_name'] = self.get_field(record,['commit','author','name']) + record['author_login'] = self.get_field(record,['author','login']) + record['committer_email'] = self.get_field(record,['commit','committer','email']) + record['committer_name'] = self.get_field(record,['commit','committer','name']) + class UserEmail(IncrementalStream): ''' - https://docs.github.com/en/rest/users/users?apiVersion=2022-11-28 + Created from fields of Commits table ''' - tap_stream_id = "user_email" + tap_stream_id = "commit_users_emails" replication_method = "INCREMENTAL" - key_properties = ["comitter_email"] - id_keys = ['committer_login'] + key_properties = ["email"] + id_keys = ['author_email'] no_path = True - inherit_parent_fields = ["comitter_email","committer_id","committer_name","committer_login"] + inherit_parent_fields = [("email","author_email"),("id","author_id"),("name","author_name"),("username","author_login")] + parent = 'commits' class Comments(IncrementalOrderedStream): ''' @@ -795,7 +809,8 @@ def add_fields_at_1st_level(self, record, parent_record = None): """ Add fields in the record explicitly at the 1st level of JSON. """ - record['owner_id'] = record['owner']['id'] + if not record: return + record['owner_id'] = self.get_field(record,['owner','id']) # Dictionary of the stream classes STREAMS = { @@ -822,5 +837,5 @@ def add_fields_at_1st_level(self, record, parent_record = None): "team_memberships": TeamMemberships, "collaborators": Collaborators, "stargazers": StarGazers, - "user_email": UserEmail + "commit_users_emails": UserEmail, } From a123b298e72f89cbb6b132ce7bb118e69aeff69d Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Tue, 11 Apr 2023 12:46:24 +0900 Subject: [PATCH 05/37] feat: Add release_assets table --- tap_github/schemas/release_assets.json | 121 +++++++++++++++++++++++++ tap_github/schemas/releases.json | 115 ----------------------- tap_github/streams.py | 103 +++++++++++++-------- 3 files changed, 188 insertions(+), 151 deletions(-) create mode 100644 tap_github/schemas/release_assets.json diff --git a/tap_github/schemas/release_assets.json b/tap_github/schemas/release_assets.json new file mode 100644 index 00000000..8f9f74b9 --- /dev/null +++ b/tap_github/schemas/release_assets.json @@ -0,0 +1,121 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "release_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "browser_download_url": { + "type": ["null", "string"], + "format": "uri" + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "label": { + "type": ["null", "string"] + }, + "state": { + "type": ["null", "string"] + }, + "content_type": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "integer"] + }, + "download_count": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "uploader_id": { + "type": ["null", "integer"] + }, + "uploader": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + } + } + } + } + } \ No newline at end of file diff --git a/tap_github/schemas/releases.json b/tap_github/schemas/releases.json index b903a026..e4647e76 100644 --- a/tap_github/schemas/releases.json +++ b/tap_github/schemas/releases.json @@ -35,121 +35,6 @@ "reactions": { "$ref": "shared/reactions.json#/" }, - "assets": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "url": { - "type": ["null", "string"] - }, - "browser_download_url": { - "type": ["null", "string"], - "format": "uri" - }, - "id": { - "type": ["null", "integer"] - }, - "node_id": { - "type": ["null", "string"] - }, - "name": { - "type": ["null", "string"] - }, - "label": { - "type": ["null", "string"] - }, - "state": { - "type": ["null", "string"] - }, - "content_type": { - "type": ["null", "string"] - }, - "size": { - "type": ["null", "integer"] - }, - "download_count": { - "type": ["null", "integer"] - }, - "created_at": { - "type": ["null", "string"], - "format": "date-time" - }, - "updated_at": { - "type": ["null", "string"], - "format": "date-time" - }, - "uploader": { - "type": ["null", "object"], - "properties": { - "name": { - "type": ["null", "string"] - }, - "email": { - "type": ["null", "string"] - }, - "login": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "integer"] - }, - "node_id": { - "type": ["null", "string"] - }, - "avatar_url": { - "type": ["null", "string"] - }, - "gravatar_id": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - }, - "html_url": { - "type": ["null", "string"] - }, - "followers_url": { - "type": ["null", "string"] - }, - "following_url": { - "type": ["null", "string"] - }, - "gists_url": { - "type": ["null", "string"] - }, - "starred_url": { - "type": ["null", "string"] - }, - "subscriptions_url": { - "type": ["null", "string"] - }, - "organizations_url": { - "type": ["null", "string"] - }, - "repos_url": { - "type": ["null", "string"] - }, - "events_url": { - "type": ["null", "string"] - }, - "received_events_url": { - "type": ["null", "string"] - }, - "type": { - "type": ["null", "string"] - }, - "site_admin": { - "type": ["null", "boolean"] - }, - "starred_at": { - "type": ["null", "string"] - } - } - } - } - } - }, "mentions_count": { "type": ["null", "integer"] }, diff --git a/tap_github/streams.py b/tap_github/streams.py index ecdbbcbe..b308830e 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -71,6 +71,7 @@ class Stream: headers = {'Accept': '*/*'} parent = None inherit_parent_fields = [] + inherit_array_parent_fields = "" no_path = False def build_url(self, base_url, repo_path, bookmark): @@ -200,27 +201,32 @@ def get_child_records(self, singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) elif child_object.no_path: - extraction_time = singer.utils.now() - record = {} - for column, field in child_object.inherit_parent_fields: - record[column] = parent_record.get(field) - with singer.Transformer() as transformer: - - rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) - - if child_object.tap_stream_id in selected_stream_ids: - singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) - counter.increment() - - # Loop thru each child and nested child in the parent and fetch all the child records. - for nested_child in child_object.children: - if nested_child in stream_to_sync: - # Collect id of child record to pass in the API of its sub-child. - child_id = tuple(record.get(key) for key in STREAMS[nested_child]().id_keys) - if STREAMS[nested_child]().id_keys and not all(child_id): continue - # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to - # pass in the API of the current child's sub-child. - child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, record) + records = [] + if child_object.inherit_array_parent_fields: + for record in parent_record.get(child_object.inherit_array_parent_fields): + records.append(record) + else: records.append({}) + for record in records: + for column, field in child_object.inherit_parent_fields: + record[column] = parent_record.get(field) + child_object.add_fields_at_1st_level(record = record, parent_record = parent_record) + with singer.Transformer() as transformer: + + rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) + + if child_object.tap_stream_id in selected_stream_ids: + singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + counter.increment() + + # Loop thru each child and nested child in the parent and fetch all the child records. + for nested_child in child_object.children: + if nested_child in stream_to_sync: + # Collect id of child record to pass in the API of its sub-child. + child_id = tuple(record.get(key) for key in STREAMS[nested_child]().id_keys) + if STREAMS[nested_child]().id_keys and not all(child_id): continue + # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to + # pass in the API of the current child's sub-child. + child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, record) # pylint: disable=unnecessary-pass def add_fields_at_1st_level(self, record, parent_record = None): @@ -284,19 +290,21 @@ def sync_endpoint(self, if child in stream_to_sync: parent_id = tuple(record.get(key) for key in STREAMS[child]().id_keys) - - # Sync child stream, if it is selected or its nested child is selected. - self.get_child_records(client, - catalog, - child, - parent_id, - repo_path, - state, - start_date, - record.get(self.replication_keys), - stream_to_sync, - selected_stream_ids, - parent_record = record) + if STREAMS[child]().id_keys and not all(parent_id): + pass + else: + # Sync child stream, if it is selected or its nested child is selected. + self.get_child_records(client, + catalog, + child, + parent_id, + repo_path, + state, + start_date, + record.get(self.replication_keys), + stream_to_sync, + selected_stream_ids, + parent_record = record) return state class IncrementalStream(Stream): @@ -638,7 +646,7 @@ class Teams(FullTableStream): key_properties = ["id"] path = "orgs/{}/teams" use_organization = True - children= ["team_members"] + children = ["team_members"] pk_child_fields = ['slug'] class Commits(IncrementalStream): @@ -676,7 +684,7 @@ class UserEmail(IncrementalStream): tap_stream_id = "commit_users_emails" replication_method = "INCREMENTAL" key_properties = ["email"] - id_keys = ['author_email'] + id_keys = ["author_email"] no_path = True inherit_parent_fields = [("email","author_email"),("id","author_id"),("name","author_name"),("username","author_login")] parent = 'commits' @@ -720,6 +728,28 @@ class Releases(FullTableStream): replication_method = "FULL_TABLE" key_properties = ["id"] path = "releases?sort=created_at&direction=desc" + chldren = ["release_assets"] + +class ReleaseAssets(FullTableStream): + ''' + https://docs.github.com/en/rest/releases/releases#list-releases + ''' + tap_stream_id = "release_assets" + replication_method = "FULL_TABLE" + key_properties = ["id"] + use_repository = True + id_keys = ["id"] + no_path = True + inherit_parent_fields = [("release_id","id")] + inherit_array_parent_fields = "assets" + parent = 'commits' + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + if not record: return + record['uploader_id'] = self.get_field(record,['uploader','id']) class IssueLabels(FullTableStream): ''' @@ -820,6 +850,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): "issues": Issues, "assignees": Assignees, "releases": Releases, + "release_assets": ReleaseAssets, "issue_labels": IssueLabels, "issue_events": IssueEvents, "events": Events, From a35a4509c498d41e4a59a14f46de552e308c1f2b Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Tue, 11 Apr 2023 15:57:48 +0900 Subject: [PATCH 06/37] fix:Fixes for ReleaseAssets table --- tap_github/streams.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index b308830e..b49aad95 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -728,7 +728,7 @@ class Releases(FullTableStream): replication_method = "FULL_TABLE" key_properties = ["id"] path = "releases?sort=created_at&direction=desc" - chldren = ["release_assets"] + children = ["release_assets"] class ReleaseAssets(FullTableStream): ''' @@ -740,9 +740,9 @@ class ReleaseAssets(FullTableStream): use_repository = True id_keys = ["id"] no_path = True - inherit_parent_fields = [("release_id","id")] + inherit_parent_fields = [("release_id","id"), ("_sdc_repository","_sdc_repository")] inherit_array_parent_fields = "assets" - parent = 'commits' + parent = 'releases' def add_fields_at_1st_level(self, record, parent_record = None): """ From 6700e43212a0c1b167e4cda45a7611db70481ce5 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Tue, 11 Apr 2023 16:45:51 +0900 Subject: [PATCH 07/37] feat: Add branches table --- tap_github/schemas/branches.json | 448 +++++++++++++++++++++++++++++++ tap_github/streams.py | 17 ++ 2 files changed, 465 insertions(+) create mode 100644 tap_github/schemas/branches.json diff --git a/tap_github/schemas/branches.json b/tap_github/schemas/branches.json new file mode 100644 index 00000000..ff1d55cf --- /dev/null +++ b/tap_github/schemas/branches.json @@ -0,0 +1,448 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "name": { + "type": ["null", "string"] + }, + "commit_sha": { + "type": ["null", "string"] + }, + "commit": { + "type": ["null", "object"], + "properties": { + "sha": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"], + "format": "uri" + } + } + }, + "protected": { + "type": ["null", "boolean"] + }, + "protection": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "enabled": { + "type": ["null", "boolean"] + }, + "required_status_checks": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "enforcement_level": { + "type": ["null", "string"] + }, + "contexts": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "checks": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "context": { + "type": ["null", "string"] + }, + "app_id": { + "type": ["null", "integer"] + } + } + } + }, + "contexts_url": { + "type": ["null", "string"] + }, + "strict": { + "type": ["null", "boolean"] + } + } + }, + "enforce_admins": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"], + "format": "uri" + }, + "enabled": { + "type": ["null", "boolean"] + } + } + }, + "required_pull_request_reviews": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"], + "format": "uri" + }, + "dismissal_restrictions": { + "type": ["null", "object"], + "properties": { + "users": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } + }, + "teams": { + "type": ["null", "array"], + "items": { + "$ref": "teams.json#/" + } + }, + "apps": { + "type": ["null", "array"], + "items": { + "id": { + "type": ["null", "number"] + }, + "slug": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "external_url": { + "type": ["null", "string"], + "format": "uri" + }, + "html_url": { + "type": ["null", "string"], + "format": "uri" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "events": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "installations_count": { + "type": ["null", "integer"] + }, + "client_id": { + "type": ["null", "string"] + }, + "client_secret": { + "type": ["null", "string"] + }, + "webhook_secret": { + "type": ["null", "string"] + }, + "pem": { + "type": ["null", "string"] + } + } + }, + "users_url": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + } + } + }, + "bypass_pull_request_allowances": { + "type": ["null", "object"], + "properties": { + "users": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } + }, + "teams": { + "type": ["null", "array"], + "items": { + "$ref": "teams.json#/" + } + }, + "apps": { + "type": ["null", "array"], + "items": { + "id": { + "type": ["null", "number"] + }, + "slug": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "external_url": { + "type": ["null", "string"], + "format": "uri" + }, + "html_url": { + "type": ["null", "string"], + "format": "uri" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "events": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "installations_count": { + "type": ["null", "integer"] + }, + "client_id": { + "type": ["null", "string"] + }, + "client_secret": { + "type": ["null", "string"] + }, + "webhook_secret": { + "type": ["null", "string"] + }, + "pem": { + "type": ["null", "string"] + } + } + } + } + }, + "dismiss_stale_reviews": { + "type": ["null", "boolean"] + }, + "require_code_owner_reviews": { + "type": ["null", "boolean"] + }, + "required_approving_review_count": { + "type": ["null", "integer"] + }, + "require_last_push_approval": { + "type": ["null", "boolean"] + } + } + }, + "restrictions": { + "type": ["null", "object"], + "properties": { + "users": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } + }, + "teams": { + "type": ["null", "array"], + "items": { + "$ref": "teams.json#/" + } + }, + "apps": { + "type": ["null", "array"], + "items": { + "id": { + "type": ["null", "number"] + }, + "slug": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "external_url": { + "type": ["null", "string"], + "format": "uri" + }, + "html_url": { + "type": ["null", "string"], + "format": "uri" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "events": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "installations_count": { + "type": ["null", "integer"] + }, + "client_id": { + "type": ["null", "string"] + }, + "client_secret": { + "type": ["null", "string"] + }, + "webhook_secret": { + "type": ["null", "string"] + }, + "pem": { + "type": ["null", "string"] + } + } + }, + "users_url": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "apps_url": { + "type": ["null", "string"] + } + } + }, + "required_linear_history": { + "type": ["null", "object"], + "properties": { + "enabled": { + "type": ["null", "boolean"] + } + } + }, + "allow_force_pushes": { + "type": ["null", "object"], + "properties": { + "enabled": { + "type": ["null", "boolean"] + } + } + }, + "allow_deletions": { + "type": ["null", "object"], + "properties": { + "enabled": { + "type": ["null", "boolean"] + } + } + }, + "block_creations": { + "type": ["null", "object"], + "properties": { + "enabled": { + "type": ["null", "boolean"] + } + } + }, + "required_conversation_resolution": { + "type": ["null", "object"], + "properties": { + "enabled": { + "type": ["null", "boolean"] + } + } + }, + "name": { + "type": ["null", "string"] + }, + "protection_url": { + "type": ["null", "string"] + }, + "required_signatures": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"], + "format": "uri" + }, + "enabled": { + "type": ["null", "boolean"] + } + } + }, + "lock_branch": { + "type": ["null", "object"], + "properties": { + "enabled": { + "type": ["null", "boolean"] + } + } + }, + "allow_fork_syncing": { + "type": ["null", "object"], + "properties": { + "enabled": { + "type": ["null", "boolean"] + } + } + } + } + }, + "protection_url": { + "type": ["null", "string"] + } + } + } \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index b49aad95..3b925848 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -202,6 +202,7 @@ def get_child_records(self, singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) elif child_object.no_path: records = [] + extraction_time = singer.utils.now() if child_object.inherit_array_parent_fields: for record in parent_record.get(child_object.inherit_array_parent_fields): records.append(record) @@ -751,6 +752,21 @@ def add_fields_at_1st_level(self, record, parent_record = None): if not record: return record['uploader_id'] = self.get_field(record,['uploader','id']) +class Branches(FullTableStream): + ''' + https://docs.github.com/en/rest/branches/branches#list-branches + ''' + tap_stream_id = "branches" + replication_method = "FULL_TABLE" + key_properties = ["name"] + path = "branches" + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + if not record: return + record['commit_sha'] = self.get_field(record,['commit','sha']) class IssueLabels(FullTableStream): ''' https://docs.github.com/en/rest/issues/labels#list-labels-for-a-repository @@ -851,6 +867,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): "assignees": Assignees, "releases": Releases, "release_assets": ReleaseAssets, + "branches": Branches, "issue_labels": IssueLabels, "issue_events": IssueEvents, "events": Events, From d0a82b941fcaae8c4b8c195a773c833fbafed715 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Tue, 11 Apr 2023 17:23:11 +0900 Subject: [PATCH 08/37] feat: Add commit_files and commit_parents table --- tap_github/schemas/commit_files.json | 44 +++++++++++++++++++++++ tap_github/schemas/commit_parents.json | 22 ++++++++++++ tap_github/schemas/commits.json | 50 -------------------------- tap_github/streams.py | 28 ++++++++++++++- 4 files changed, 93 insertions(+), 51 deletions(-) create mode 100644 tap_github/schemas/commit_files.json create mode 100644 tap_github/schemas/commit_parents.json diff --git a/tap_github/schemas/commit_files.json b/tap_github/schemas/commit_files.json new file mode 100644 index 00000000..86efcbaf --- /dev/null +++ b/tap_github/schemas/commit_files.json @@ -0,0 +1,44 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "commit_sha": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "filename": { + "type": ["null", "string"] + }, + "additions": { + "type": ["null", "number"] + }, + "deletions": { + "type": ["null", "number"] + }, + "changes": { + "type": ["null", "number"] + }, + "status": { + "type": ["null", "string"] + }, + "raw_url": { + "type": ["null", "string"] + }, + "blob_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "patch": { + "type": ["null", "string"] + }, + "previous_filename": { + "type": ["null", "string"] + } + } +} \ No newline at end of file diff --git a/tap_github/schemas/commit_parents.json b/tap_github/schemas/commit_parents.json new file mode 100644 index 00000000..42744838 --- /dev/null +++ b/tap_github/schemas/commit_parents.json @@ -0,0 +1,22 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "children_sha": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"], + "format": "uri" + }, + "html_url": { + "type": ["null", "string"], + "format": "uri" + } + } +} \ No newline at end of file diff --git a/tap_github/schemas/commits.json b/tap_github/schemas/commits.json index a66ff12f..29de80bc 100644 --- a/tap_github/schemas/commits.json +++ b/tap_github/schemas/commits.json @@ -20,56 +20,6 @@ "message": { "type": ["null", "string"] }, - "parents": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "additionalProperties": false, - "properties": { - "sha": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - }, - "html_url": { - "type": ["null", "string"] - } - } - } - }, - "files": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "filename": { - "type": ["null", "string"] - }, - "additions": { - "type": ["null", "number"] - }, - "deletions": { - "type": ["null", "number"] - }, - "changes": { - "type": ["null", "number"] - }, - "status": { - "type": ["null", "string"] - }, - "raw_url": { - "type": ["null", "string"] - }, - "blob_url": { - "type": ["null", "string"] - }, - "patch": { - "type": ["null", "string"] - } - } - } - }, "html_url": { "type": ["null", "string"] }, diff --git a/tap_github/streams.py b/tap_github/streams.py index 3b925848..f0de85bd 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -659,7 +659,7 @@ class Commits(IncrementalStream): replication_keys = "updated_at" key_properties = ["sha"] path = "commits" - children= ["commit_users_emails"] + children= ["commit_users_emails", "commit_files", "commit_parents"] filter_param = True def add_fields_at_1st_level(self, record, parent_record = None): @@ -677,6 +677,30 @@ def add_fields_at_1st_level(self, record, parent_record = None): record['committer_email'] = self.get_field(record,['commit','committer','email']) record['committer_name'] = self.get_field(record,['commit','committer','name']) +class CommitFiles(IncrementalStream): + ''' + https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository + ''' + tap_stream_id = "commit_files" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["commit_sha", "filename"] + no_path = True + inherit_parent_fields = [("commit_sha","sha"), ("_sdc_repository","_sdc_repository")] + inherit_array_parent_fields = "files" + +class CommitParents(IncrementalStream): + ''' + https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository + ''' + tap_stream_id = "commit_parents" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["child_sha","sha"] + no_path = True + inherit_parent_fields = [("children_sha","sha"), ("_sdc_repository","_sdc_repository")] + inherit_array_parent_fields = "parents" + class UserEmail(IncrementalStream): ''' @@ -862,6 +886,8 @@ def add_fields_at_1st_level(self, record, parent_record = None): STREAMS = { "repositories": Repositories, "commits": Commits, + "commit_files": CommitFiles, + "commit_parents": CommitParents, "comments": Comments, "issues": Issues, "assignees": Assignees, From ee5149cc284a11f5df9f2dca56b249a7df988b21 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Tue, 11 Apr 2023 17:35:06 +0900 Subject: [PATCH 09/37] feat:Add commit_pull_request table --- tap_github/schemas/commit_pull_request.json | 15 ++++++++++++++ tap_github/streams.py | 23 ++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 tap_github/schemas/commit_pull_request.json diff --git a/tap_github/schemas/commit_pull_request.json b/tap_github/schemas/commit_pull_request.json new file mode 100644 index 00000000..9d9797df --- /dev/null +++ b/tap_github/schemas/commit_pull_request.json @@ -0,0 +1,15 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "commit_sha": { + "type": ["null", "string"] + }, + "pull_request_id": { + "type": ["null", "integer"] + } + } +} + \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index f0de85bd..222dbb8e 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -659,7 +659,7 @@ class Commits(IncrementalStream): replication_keys = "updated_at" key_properties = ["sha"] path = "commits" - children= ["commit_users_emails", "commit_files", "commit_parents"] + children= ["commit_users_emails", "commit_files", "commit_parents", "commit_pull_request"] filter_param = True def add_fields_at_1st_level(self, record, parent_record = None): @@ -701,6 +701,26 @@ class CommitParents(IncrementalStream): inherit_parent_fields = [("children_sha","sha"), ("_sdc_repository","_sdc_repository")] inherit_array_parent_fields = "parents" +class CommitPullRequest(IncrementalStream): + ''' + https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository + ''' + tap_stream_id = "commit_pull_request" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["commit_sha","pull_request_id"] + path = "commits/{}/pulls" + id_keys = ["sha"] + inherit_parent_fields = [("commit_sha","sha"), ("_sdc_repository","_sdc_repository")] + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + if not record: return + record['pull_request_id'] = self.get_field(record,['id']) + + class UserEmail(IncrementalStream): ''' @@ -888,6 +908,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): "commits": Commits, "commit_files": CommitFiles, "commit_parents": CommitParents, + "commit_pull_request": CommitPullRequest, "comments": Comments, "issues": Issues, "assignees": Assignees, From 3a91b294410fd64d689ff4250ac36b5088674297 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Tue, 11 Apr 2023 17:44:36 +0900 Subject: [PATCH 10/37] fix:Create new shared schema for teams --- tap_github/schemas/branches.json | 6 +-- tap_github/schemas/shared/teams.json | 61 ++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 tap_github/schemas/shared/teams.json diff --git a/tap_github/schemas/branches.json b/tap_github/schemas/branches.json index ff1d55cf..6dc74a83 100644 --- a/tap_github/schemas/branches.json +++ b/tap_github/schemas/branches.json @@ -102,7 +102,7 @@ "teams": { "type": ["null", "array"], "items": { - "$ref": "teams.json#/" + "$ref": "shared/teams.json#/" } }, "apps": { @@ -191,7 +191,7 @@ "teams": { "type": ["null", "array"], "items": { - "$ref": "teams.json#/" + "$ref": "shared/teams.json#/" } }, "apps": { @@ -285,7 +285,7 @@ "teams": { "type": ["null", "array"], "items": { - "$ref": "teams.json#/" + "$ref": "shared/teams.json#/" } }, "apps": { diff --git a/tap_github/schemas/shared/teams.json b/tap_github/schemas/shared/teams.json new file mode 100644 index 00000000..805e1a5c --- /dev/null +++ b/tap_github/schemas/shared/teams.json @@ -0,0 +1,61 @@ +{ + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "permissions": { + "type": ["null", "object"], + "properties": { + "pull": { + "type": ["null", "boolean"] + }, + "triage": { + "type": ["null", "boolean"] + }, + "push": { + "type": ["null", "boolean"] + }, + "maintain": { + "type": ["null", "boolean"] + }, + "admin": { + "type": ["null", "boolean"] + } + } + }, + "name": { + "type": ["null", "string"] + }, + "slug": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "privacy": { + "type": ["null", "string"] + }, + "permission": { + "type": ["null", "string"] + }, + "members_url": { + "type": ["null", "string"] + }, + "repositories_url": { + "type": ["null", "string"] + }, + "parent": { + "type": ["null", "object", "string"] + } + } +} \ No newline at end of file From edf3bed82b0bbe9c5cd60feaccfc4a1cd270871b Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 14:08:25 +0900 Subject: [PATCH 11/37] fix:Make fixes for new tables --- tap_github/streams.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 222dbb8e..5528b9e1 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -170,6 +170,8 @@ def get_child_records(self, # Loop through all the records of response for record in records: record['_sdc_repository'] = repo_path + for column, field in child_object.inherit_parent_fields: + record[column] = parent_record.get(field) child_object.add_fields_at_1st_level(record = record, parent_record = parent_record) with singer.Transformer() as transformer: @@ -383,7 +385,8 @@ def sync_endpoint(self, start_date, record.get(self.replication_keys), stream_to_sync, - selected_stream_ids) + selected_stream_ids, + parent_record = record) else: LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.", self.tap_stream_id, self.key_properties, record[self.key_properties], self.replication_keys) @@ -688,6 +691,7 @@ class CommitFiles(IncrementalStream): no_path = True inherit_parent_fields = [("commit_sha","sha"), ("_sdc_repository","_sdc_repository")] inherit_array_parent_fields = "files" + parent = 'commits' class CommitParents(IncrementalStream): ''' @@ -700,6 +704,7 @@ class CommitParents(IncrementalStream): no_path = True inherit_parent_fields = [("children_sha","sha"), ("_sdc_repository","_sdc_repository")] inherit_array_parent_fields = "parents" + parent = 'commits' class CommitPullRequest(IncrementalStream): ''' @@ -710,8 +715,10 @@ class CommitPullRequest(IncrementalStream): replication_keys = "updated_at" key_properties = ["commit_sha","pull_request_id"] path = "commits/{}/pulls" + use_repository = True id_keys = ["sha"] inherit_parent_fields = [("commit_sha","sha"), ("_sdc_repository","_sdc_repository")] + parent = 'commits' def add_fields_at_1st_level(self, record, parent_record = None): """ From 97d37462c63290833fd1763cc5306bdac674699b Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 14:09:24 +0900 Subject: [PATCH 12/37] fix:Make fixes for array inherited fields --- tap_github/streams.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 5528b9e1..015d2e45 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -206,7 +206,7 @@ def get_child_records(self, records = [] extraction_time = singer.utils.now() if child_object.inherit_array_parent_fields: - for record in parent_record.get(child_object.inherit_array_parent_fields): + for record in parent_record.get(child_object.inherit_array_parent_fields,[]): records.append(record) else: records.append({}) for record in records: @@ -700,7 +700,7 @@ class CommitParents(IncrementalStream): tap_stream_id = "commit_parents" replication_method = "INCREMENTAL" replication_keys = "updated_at" - key_properties = ["child_sha","sha"] + key_properties = ["children_sha","sha"] no_path = True inherit_parent_fields = [("children_sha","sha"), ("_sdc_repository","_sdc_repository")] inherit_array_parent_fields = "parents" From 89cbcf3ff04647d78771f1415a2baf581628fe0a Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 14:25:39 +0900 Subject: [PATCH 13/37] feat: Implement deployments table --- tap_github/schemas/branches.json | 59 +----------------------- tap_github/schemas/deployments.json | 69 +++++++++++++++++++++++++++++ tap_github/schemas/shared/app.json | 63 ++++++++++++++++++++++++++ tap_github/streams.py | 17 +++++++ 4 files changed, 150 insertions(+), 58 deletions(-) create mode 100644 tap_github/schemas/deployments.json create mode 100644 tap_github/schemas/shared/app.json diff --git a/tap_github/schemas/branches.json b/tap_github/schemas/branches.json index 6dc74a83..ebac9b9d 100644 --- a/tap_github/schemas/branches.json +++ b/tap_github/schemas/branches.json @@ -108,64 +108,7 @@ "apps": { "type": ["null", "array"], "items": { - "id": { - "type": ["null", "number"] - }, - "slug": { - "type": ["null", "string"] - }, - "node_id": { - "type": ["null", "string"] - }, - "owner": { - "$ref": "shared/user.json#/" - }, - "name": { - "type": ["null", "string"] - }, - "description": { - "type": ["null", "string"] - }, - "external_url": { - "type": ["null", "string"], - "format": "uri" - }, - "html_url": { - "type": ["null", "string"], - "format": "uri" - }, - "created_at": { - "type": ["null", "string"], - "format": "date-time" - }, - "updated_at": { - "type": ["null", "string"], - "format": "date-time" - }, - "permissions": { - "$ref": "shared/pull_permissions.json#/" - }, - "events": { - "type": ["null", "array"], - "items": { - "type": ["null", "string"] - } - }, - "installations_count": { - "type": ["null", "integer"] - }, - "client_id": { - "type": ["null", "string"] - }, - "client_secret": { - "type": ["null", "string"] - }, - "webhook_secret": { - "type": ["null", "string"] - }, - "pem": { - "type": ["null", "string"] - } + "$ref": "shared/app.json#/" } }, "users_url": { diff --git a/tap_github/schemas/deployments.json b/tap_github/schemas/deployments.json new file mode 100644 index 00000000..46154008 --- /dev/null +++ b/tap_github/schemas/deployments.json @@ -0,0 +1,69 @@ +{ + "type": ["null", "object"], + "additionalProperties": false, + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "ref": { + "type": ["null", "string"] + }, + "task": { + "type": ["null", "string"] + }, + "payload": { + "type": ["null", "object", "string"] + }, + "original_environment": { + "type": ["null", "string"] + }, + "environment": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "creator_id": { + "type": ["null", "integer"] + }, + "creator": { + "$ref": "shared/user.json#/" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "statuses_url": { + "type": ["null", "string"] + }, + "repository_url": { + "type": ["null", "string"] + }, + "transient_environment": { + "type": ["null", "boolean"] + }, + "production_environment": { + "type": ["null", "boolean"] + }, + "performed_via_github_app": { + "$ref": "shared/app.json#/" + } + } + } + \ No newline at end of file diff --git a/tap_github/schemas/shared/app.json b/tap_github/schemas/shared/app.json new file mode 100644 index 00000000..7045891c --- /dev/null +++ b/tap_github/schemas/shared/app.json @@ -0,0 +1,63 @@ +{ + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "number"] + }, + "slug": { + "type": ["null", "string"] + }, + "node_id": { + "type": ["null", "string"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "external_url": { + "type": ["null", "string"], + "format": "uri" + }, + "html_url": { + "type": ["null", "string"], + "format": "uri" + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "events": { + "type": ["null", "array"], + "items": { + "type": ["null", "string"] + } + }, + "installations_count": { + "type": ["null", "integer"] + }, + "client_id": { + "type": ["null", "string"] + }, + "client_secret": { + "type": ["null", "string"] + }, + "webhook_secret": { + "type": ["null", "string"] + }, + "pem": { + "type": ["null", "string"] + } + } +} \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index 015d2e45..65e2b1d8 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -909,6 +909,22 @@ def add_fields_at_1st_level(self, record, parent_record = None): if not record: return record['owner_id'] = self.get_field(record,['owner','id']) +class Deployments(FullTableStream): + ''' + https://docs.github.com/en/rest/deployments/deployments#list-deployments + ''' + tap_stream_id = "deployments" + replication_method = "FULL_TABLE" + key_properties = ["id"] + path = "deployments" + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + if not record: return + record['creator_id'] = self.get_field(record,['creator','id']) + # Dictionary of the stream classes STREAMS = { "repositories": Repositories, @@ -940,4 +956,5 @@ def add_fields_at_1st_level(self, record, parent_record = None): "collaborators": Collaborators, "stargazers": StarGazers, "commit_users_emails": UserEmail, + "deployments": Deployments, } From 5c5aac9aa8688cdad2ae43348161d559dc3090bf Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 14:39:35 +0900 Subject: [PATCH 14/37] feat: Implement deployment statuses table --- tap_github/schemas/deployment_statuses.json | 59 +++++++++++++++++++++ tap_github/schemas/deployments.json | 1 - tap_github/streams.py | 25 ++++++++- 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 tap_github/schemas/deployment_statuses.json diff --git a/tap_github/schemas/deployment_statuses.json b/tap_github/schemas/deployment_statuses.json new file mode 100644 index 00000000..17da4d78 --- /dev/null +++ b/tap_github/schemas/deployment_statuses.json @@ -0,0 +1,59 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "state": { + "type": ["null", "string"] + }, + "creator_id": { + "type": ["null", "integer"] + }, + "creator": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "environment": { + "type": ["null", "string"] + }, + "target_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "deployment_url": { + "type": ["null", "string"] + }, + "repository_url": { + "type": ["null", "string"] + }, + "environment_url": { + "type": ["null", "string"] + }, + "log_url": { + "type": ["null", "string"] + }, + "performed_via_github_app": { + "$ref": "shared/app.json#/" + } + } + } + \ No newline at end of file diff --git a/tap_github/schemas/deployments.json b/tap_github/schemas/deployments.json index 46154008..76d00c67 100644 --- a/tap_github/schemas/deployments.json +++ b/tap_github/schemas/deployments.json @@ -1,6 +1,5 @@ { "type": ["null", "object"], - "additionalProperties": false, "properties": { "_sdc_repository": { "type": ["string"] diff --git a/tap_github/streams.py b/tap_github/streams.py index 65e2b1d8..70cb113a 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -916,7 +916,28 @@ class Deployments(FullTableStream): tap_stream_id = "deployments" replication_method = "FULL_TABLE" key_properties = ["id"] - path = "deployments" + path = "deployments?sort=created_at&direction=desc" + children = ["deployment_statuses"] + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + if not record: return + record['creator_id'] = self.get_field(record,['creator','id']) + +class DeploymentStatuses(FullTableStream): + ''' + https://docs.github.com/en/rest/deployments/statuses#list-deployment-statuses + ''' + tap_stream_id = "deployment_statuses" + replication_method = "FULL_TABLE" + use_repository = True + key_properties = ["deployment_id","id"] + path = "deployments/{}/statuses" + id_keys = ["id"] + inherit_parent_fields = [("deployment_id","id")] + parent = 'deployments' def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -957,4 +978,6 @@ def add_fields_at_1st_level(self, record, parent_record = None): "stargazers": StarGazers, "commit_users_emails": UserEmail, "deployments": Deployments, + "deployment_statuses": DeploymentStatuses, + } From c9e05427d21628ac5d4dfd84b826c52959366969 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 14:54:53 +0900 Subject: [PATCH 15/37] feat: Implement issue_assignees table --- tap_github/schemas/issue_assignees.json | 75 +++++++++++++++++++++++++ tap_github/schemas/issues.json | 6 -- tap_github/streams.py | 23 +++++++- 3 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 tap_github/schemas/issue_assignees.json diff --git a/tap_github/schemas/issue_assignees.json b/tap_github/schemas/issue_assignees.json new file mode 100644 index 00000000..2542d54f --- /dev/null +++ b/tap_github/schemas/issue_assignees.json @@ -0,0 +1,75 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["string"] + }, + "issue_id": { + "type": ["null", "integer"] + }, + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + } + } + } + \ No newline at end of file diff --git a/tap_github/schemas/issues.json b/tap_github/schemas/issues.json index 93365708..e55ed930 100644 --- a/tap_github/schemas/issues.json +++ b/tap_github/schemas/issues.json @@ -57,12 +57,6 @@ "assignee": { "$ref": "shared/user.json#/" }, - "assignees": { - "type": ["null", "array"], - "items": { - "$ref": "shared/user.json#/" - } - }, "milestone": { "type": ["null", "object"], "properties": { diff --git a/tap_github/streams.py b/tap_github/streams.py index 70cb113a..1ea3eb34 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -559,6 +559,7 @@ class PullRequests(IncrementalOrderedStream): key_properties = ["id"] path = "pulls?state=all&sort=updated&direction=desc" children = ['reviews', 'review_comments', 'pr_commits'] + has_children = True pk_child_fields = ["number"] class ProjectCards(IncrementalStream): @@ -599,6 +600,7 @@ class Projects(IncrementalStream): path = "projects?state=all" tap_stream_id = "projects" children = ["project_columns"] + has_children = True child_objects = [ProjectColumns()] class TeamMemberships(FullTableStream): @@ -651,6 +653,7 @@ class Teams(FullTableStream): path = "orgs/{}/teams" use_organization = True children = ["team_members"] + has_children = True pk_child_fields = ['slug'] class Commits(IncrementalStream): @@ -663,6 +666,7 @@ class Commits(IncrementalStream): key_properties = ["sha"] path = "commits" children= ["commit_users_emails", "commit_files", "commit_parents", "commit_pull_request"] + has_children = True filter_param = True def add_fields_at_1st_level(self, record, parent_record = None): @@ -762,6 +766,21 @@ class Issues(IncrementalOrderedStream): key_properties = ["id"] filter_param = True path = "issues?state=all&sort=updated&direction=desc" + children = ["issue_assignees"] + has_children = True + +class IssueAssignees(IncrementalOrderedStream): + ''' + https://docs.github.com/en/rest/issues/issues#list-repository-issues + ''' + tap_stream_id = "issue_assignees" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["issue_id","id"] + no_path = True + inherit_parent_fields = [("issue_id","id"), ("_sdc_repository","_sdc_repository")] + inherit_array_parent_fields = "assignees" + parent = 'issues' class Assignees(FullTableStream): ''' @@ -781,6 +800,7 @@ class Releases(FullTableStream): key_properties = ["id"] path = "releases?sort=created_at&direction=desc" children = ["release_assets"] + has_children = True class ReleaseAssets(FullTableStream): ''' @@ -918,6 +938,7 @@ class Deployments(FullTableStream): key_properties = ["id"] path = "deployments?sort=created_at&direction=desc" children = ["deployment_statuses"] + has_children = True def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -936,7 +957,7 @@ class DeploymentStatuses(FullTableStream): key_properties = ["deployment_id","id"] path = "deployments/{}/statuses" id_keys = ["id"] - inherit_parent_fields = [("deployment_id","id")] + inherit_parent_fields = [("deployment_id","id"),("_sdc_repository","_sdc_repository")] parent = 'deployments' def add_fields_at_1st_level(self, record, parent_record = None): From 9f525f6425e59e4ba3ad4d8809d0404463df71ac Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 15:05:03 +0900 Subject: [PATCH 16/37] feat: Implement issue_labels table and change former issue_labels to labels --- tap_github/schemas/issue_labels.json | 3 +++ tap_github/schemas/issues.json | 29 ---------------------------- tap_github/schemas/labels.json | 29 ++++++++++++++++++++++++++++ tap_github/streams.py | 23 ++++++++++++++++++---- 4 files changed, 51 insertions(+), 33 deletions(-) create mode 100644 tap_github/schemas/labels.json diff --git a/tap_github/schemas/issue_labels.json b/tap_github/schemas/issue_labels.json index 32a097df..d97ee204 100644 --- a/tap_github/schemas/issue_labels.json +++ b/tap_github/schemas/issue_labels.json @@ -4,6 +4,9 @@ "_sdc_repository": { "type": ["null", "string"] }, + "issue_id": { + "type": ["null", "integer"] + }, "id": { "type": ["null", "number"] }, diff --git a/tap_github/schemas/issues.json b/tap_github/schemas/issues.json index e55ed930..8667a62d 100644 --- a/tap_github/schemas/issues.json +++ b/tap_github/schemas/issues.json @@ -9,35 +9,6 @@ "url": { "type": ["null", "string"] }, - "labels": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "id": { - "type": ["null", "integer"] - }, - "node_id": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - }, - "name": { - "type": ["null", "string"] - }, - "description": { - "type": ["null", "string"] - }, - "color": { - "type": ["null", "string"] - }, - "default": { - "type": ["null", "boolean"] - } - } - } - }, "repository_url": { "type": ["null", "string"] }, diff --git a/tap_github/schemas/labels.json b/tap_github/schemas/labels.json new file mode 100644 index 00000000..32a097df --- /dev/null +++ b/tap_github/schemas/labels.json @@ -0,0 +1,29 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "color": { + "type": ["null", "string"] + }, + "default": { + "type": ["null", "boolean"] + } + } +} diff --git a/tap_github/streams.py b/tap_github/streams.py index 1ea3eb34..6fb9a146 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -766,7 +766,7 @@ class Issues(IncrementalOrderedStream): key_properties = ["id"] filter_param = True path = "issues?state=all&sort=updated&direction=desc" - children = ["issue_assignees"] + children = ["issue_assignees","issue_labels"] has_children = True class IssueAssignees(IncrementalOrderedStream): @@ -782,6 +782,19 @@ class IssueAssignees(IncrementalOrderedStream): inherit_array_parent_fields = "assignees" parent = 'issues' +class IssueLabels(IncrementalOrderedStream): + ''' + https://docs.github.com/en/rest/issues/issues#list-repository-issues + ''' + tap_stream_id = "issue_labels" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["issue_id","id"] + no_path = True + inherit_parent_fields = [("issue_id","id"), ("_sdc_repository","_sdc_repository")] + inherit_array_parent_fields = "labels" + parent = 'issues' + class Assignees(FullTableStream): ''' https://docs.github.com/en/rest/issues/assignees#list-assignees @@ -838,11 +851,11 @@ def add_fields_at_1st_level(self, record, parent_record = None): """ if not record: return record['commit_sha'] = self.get_field(record,['commit','sha']) -class IssueLabels(FullTableStream): +class Labels(FullTableStream): ''' https://docs.github.com/en/rest/issues/labels#list-labels-for-a-repository ''' - tap_stream_id = "issue_labels" + tap_stream_id = "labels" replication_method = "FULL_TABLE" key_properties = ["id"] path = "labels" @@ -976,11 +989,13 @@ def add_fields_at_1st_level(self, record, parent_record = None): "commit_pull_request": CommitPullRequest, "comments": Comments, "issues": Issues, + "issue_assignees": IssueAssignees, + "issue_labels": IssueLabels, "assignees": Assignees, "releases": Releases, "release_assets": ReleaseAssets, "branches": Branches, - "issue_labels": IssueLabels, + "labels": Labels, "issue_events": IssueEvents, "events": Events, "commit_comments": CommitComments, From c1558edb2843313d66661d213afdb3e139d5ccc3 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 15:38:52 +0900 Subject: [PATCH 17/37] feat: Implement repository_teams table --- tap_github/schemas/repository_teams.json | 48 ++++++++++++++++++++++++ tap_github/streams.py | 10 +++++ 2 files changed, 58 insertions(+) create mode 100644 tap_github/schemas/repository_teams.json diff --git a/tap_github/schemas/repository_teams.json b/tap_github/schemas/repository_teams.json new file mode 100644 index 00000000..88be83e4 --- /dev/null +++ b/tap_github/schemas/repository_teams.json @@ -0,0 +1,48 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "name": { + "type": ["null", "string"] + }, + "slug": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "privacy": { + "type": ["null", "string"] + }, + "permission": { + "type": ["null", "string"] + }, + "members_url": { + "type": ["null", "string"] + }, + "repositories_url": { + "type": ["null", "string"] + }, + "parent": { + "type": ["null", "object", "string"] + } + } + } + \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index 6fb9a146..4c7bd8b1 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -942,6 +942,15 @@ def add_fields_at_1st_level(self, record, parent_record = None): if not record: return record['owner_id'] = self.get_field(record,['owner','id']) +class RepositoryTeams(FullTableStream): + ''' + https://docs.github.com/en/rest/repos/repos#list-repository-teams + ''' + tap_stream_id = "repository_teams" + replication_method = "FULL_TABLE" + key_properties = ["_sdc_repository","id"] + path = "teams" + class Deployments(FullTableStream): ''' https://docs.github.com/en/rest/deployments/deployments#list-deployments @@ -983,6 +992,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): # Dictionary of the stream classes STREAMS = { "repositories": Repositories, + "repository_teams": RepositoryTeams, "commits": Commits, "commit_files": CommitFiles, "commit_parents": CommitParents, From 0fb299dd16a96e7f256ccf77a97fa2b63c4c8aa6 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 16:44:53 +0900 Subject: [PATCH 18/37] feat: Implement repository_topics table --- tap_github/schemas/repository_topics.json | 12 ++++++++++++ tap_github/streams.py | 22 +++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 tap_github/schemas/repository_topics.json diff --git a/tap_github/schemas/repository_topics.json b/tap_github/schemas/repository_topics.json new file mode 100644 index 00000000..16107085 --- /dev/null +++ b/tap_github/schemas/repository_topics.json @@ -0,0 +1,12 @@ +{ + "type": ["null", "object"], + "properties": { + "repository": { + "type": ["null", "string"] + }, + "topic": { + "type": ["null", "string"] + } + } + } + \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index 4c7bd8b1..5b070781 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -72,6 +72,7 @@ class Stream: parent = None inherit_parent_fields = [] inherit_array_parent_fields = "" + custom_column_name = "" no_path = False def build_url(self, base_url, repo_path, bookmark): @@ -207,7 +208,10 @@ def get_child_records(self, extraction_time = singer.utils.now() if child_object.inherit_array_parent_fields: for record in parent_record.get(child_object.inherit_array_parent_fields,[]): - records.append(record) + if col_name := child_object.custom_column_name: + records.append({col_name: record}) + else: + records.append(record) else: records.append({}) for record in records: for column, field in child_object.inherit_parent_fields: @@ -934,6 +938,8 @@ class Repositories(FullTableStream): key_properties = ["id"] use_organization = True path = "orgs/{}/repos" + children = ["repository_topics"] + has_children = True def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -951,6 +957,19 @@ class RepositoryTeams(FullTableStream): key_properties = ["_sdc_repository","id"] path = "teams" +class RepositoryTopics(FullTableStream): + ''' + https://docs.github.com/en/rest/repos/repos#list-repository-teams + ''' + tap_stream_id = "repository_topics" + replication_method = "FULL_TABLE" + key_properties = ["repository","topic"] + no_path = True + inherit_parent_fields = [("repository","full_name")] + inherit_array_parent_fields = "topics" + custom_column_name = "topic" + parent = 'repositories' + class Deployments(FullTableStream): ''' https://docs.github.com/en/rest/deployments/deployments#list-deployments @@ -993,6 +1012,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): STREAMS = { "repositories": Repositories, "repository_teams": RepositoryTeams, + "repository_topics": RepositoryTopics, "commits": Commits, "commit_files": CommitFiles, "commit_parents": CommitParents, From c1918e7aa4b59e8ad9256704e1255aa793e2a20a Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 18:13:39 +0900 Subject: [PATCH 19/37] feat: Add workflows & workflow_runs --- tap_github/schemas/workflow_runs.json | 251 ++++++++++++++++++++++++++ tap_github/schemas/workflows.json | 45 +++++ tap_github/streams.py | 39 +++- 3 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 tap_github/schemas/workflow_runs.json create mode 100644 tap_github/schemas/workflows.json diff --git a/tap_github/schemas/workflow_runs.json b/tap_github/schemas/workflow_runs.json new file mode 100644 index 00000000..05f4c38e --- /dev/null +++ b/tap_github/schemas/workflow_runs.json @@ -0,0 +1,251 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "check_suite_id": { + "type": ["null", "integer"] + }, + "check_suite_node_id": { + "type": ["null", "string"] + }, + "head_branch": { + "type": ["null", "string"] + }, + "head_sha": { + "type": ["null", "string"] + }, + "path": { + "type": ["null", "string"] + }, + "run_number": { + "type": ["null", "integer"] + }, + "run_attempt": { + "type": ["null", "integer"] + }, + "referenced_workflows": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "path": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "ref": { + "type": ["null", "string"] + } + } + } + }, + "event": { + "type": ["null", "string"] + }, + "status": { + "type": ["null", "string"] + }, + "conclusion": { + "type": ["null", "string"] + }, + "workflow_id": { + "type": ["null", "integer"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "pull_requests": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "number": { + "type": ["null", "integer"] + }, + "url": { + "type": ["null", "string"] + }, + "head": { + "type": ["null", "object"], + "properties": { + "ref": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "repo": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + } + } + } + } + }, + "base": { + "type": ["null", "object"], + "properties": { + "ref": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "repo": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + } + } + } + } + } + } + } + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "actor_id": { + "type": ["null", "integer"] + }, + "actor": { + "$ref": "shared/user.json#/" + }, + "triggering_actor_id": { + "type": ["null", "integer"] + }, + "triggering_actor": { + "$ref": "shared/user.json#/" + }, + "run_started_at": { + "type": ["null", "string"] + }, + "jobs_url": { + "type": ["null", "string"] + }, + "logs_url": { + "type": ["null", "string"] + }, + "check_suite_url": { + "type": ["null", "string"] + }, + "artifacts_url": { + "type": ["null", "string"] + }, + "cancel_url": { + "type": ["null", "string"] + }, + "rerun_url": { + "type": ["null", "string"] + }, + "previous_attempt_url": { + "type": ["null", "string"] + }, + "workflow_url": { + "type": ["null", "string"] + }, + "head_commit": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "string"] + }, + "tree_id": { + "type": ["null", "string"] + }, + "message": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "author": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + } + } + }, + "committer": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + } + } + }, + "repo": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + } + } + } + } + }, + "repository_id": { + "type": ["null", "integer"] + }, + "head_repository_id": { + "type": ["null", "integer"] + }, + "display_title": { + "type": ["null", "string"] + } + } + } + \ No newline at end of file diff --git a/tap_github/schemas/workflows.json b/tap_github/schemas/workflows.json new file mode 100644 index 00000000..ad0deb48 --- /dev/null +++ b/tap_github/schemas/workflows.json @@ -0,0 +1,45 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "path": { + "type": ["null", "string"] + }, + "state": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "badge_url": { + "type": ["null", "string"] + }, + "deleted_at": { + "type": ["null", "string"], + "format": "date-time" + } + } + } + \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index 5b070781..a1595eb4 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -74,6 +74,7 @@ class Stream: inherit_array_parent_fields = "" custom_column_name = "" no_path = False + result_path = "" def build_url(self, base_url, repo_path, bookmark): """ @@ -165,6 +166,7 @@ def get_child_records(self, stream = child_object.tap_stream_id ): records = response.json() + if child_object.result_path: records = records.get(child_object.result_path,[]) extraction_time = singer.utils.now() if isinstance(records, list): @@ -278,6 +280,7 @@ def sync_endpoint(self, stream = self.tap_stream_id ): records = response.json() + if self.result_path: records = records.get(self.result_path,[]) extraction_time = singer.utils.now() # Loop through all records for record in records: @@ -350,6 +353,7 @@ def sync_endpoint(self, stream = self.tap_stream_id ): records = response.json() + if self.result_path: records = records.get(self.result_path,[]) extraction_time = singer.utils.now() # Loop through all records for record in records: @@ -435,6 +439,7 @@ def sync_endpoint(self, stream = self.tap_stream_id ): records = response.json() + if self.result_path: records = records.get(self.result_path,[]) extraction_time = singer.utils.now() for record in records: record['_sdc_repository'] = repo_path @@ -1008,6 +1013,37 @@ def add_fields_at_1st_level(self, record, parent_record = None): if not record: return record['creator_id'] = self.get_field(record,['creator','id']) +class Workflows(FullTableStream): + ''' + https://docs.github.com/en/rest/actions/workflows?apiVersion=2022-11-28#list-repository-workflows + ''' + tap_stream_id = "workflows" + replication_method = "FULL_TABLE" + use_repository = True + key_properties = ["id"] + path = "actions/workflows" + result_path = "workflows" + +class WorkflowRuns(FullTableStream): + ''' + https://docs.github.com/en/rest/actions/workflows?apiVersion=2022-11-28#list-repository-workflows + ''' + tap_stream_id = "workflow_runs" + replication_method = "FULL_TABLE" + use_repository = True + key_properties = ["id"] + path = "actions/runs" + result_path = "workflow_runs" + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + if not record: return + record['actor_id'] = self.get_field(record,['actor','id']) + record['triggering_actor_id'] = self.get_field(record,['triggering_actor','id']) + record['repository_id'] = self.get_field(record,['repository','id']) + # Dictionary of the stream classes STREAMS = { "repositories": Repositories, @@ -1045,5 +1081,6 @@ def add_fields_at_1st_level(self, record, parent_record = None): "commit_users_emails": UserEmail, "deployments": Deployments, "deployment_statuses": DeploymentStatuses, - + "workflows": Workflows, + "workflow_runs": WorkflowRuns } From 594768f3ad4d259529fcb36011a7b518e0387cf7 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 12 Apr 2023 18:36:04 +0900 Subject: [PATCH 20/37] fix: Add incrmental sync for workflow_runs --- tap_github/streams.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index a1595eb4..7f19cbc2 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -63,6 +63,7 @@ class Stream: key_properties = [] path = None filter_param = False + filter_param_custom = "" id_keys = [] use_organization = False children = [] @@ -83,6 +84,9 @@ def build_url(self, base_url, repo_path, bookmark): if self.filter_param: # Add the since parameter for incremental streams query_string = '?since={}'.format(bookmark) + elif self.filter_param_custom: + # Add additional custom filter for incremental streams + query_string = f'?{self.filter_param_custom}{bookmark}' else: query_string = '' @@ -1024,16 +1028,18 @@ class Workflows(FullTableStream): path = "actions/workflows" result_path = "workflows" -class WorkflowRuns(FullTableStream): +class WorkflowRuns(IncrementalStream): ''' https://docs.github.com/en/rest/actions/workflows?apiVersion=2022-11-28#list-repository-workflows ''' tap_stream_id = "workflow_runs" - replication_method = "FULL_TABLE" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" use_repository = True key_properties = ["id"] path = "actions/runs" result_path = "workflow_runs" + filter_param_custom = "created:>=" def add_fields_at_1st_level(self, record, parent_record = None): """ From a6e384937d401317c231bc742f32ccce9819db16 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Fri, 14 Apr 2023 09:05:56 +0900 Subject: [PATCH 21/37] fix:Add workflow_pull_request table --- .../schemas/workflow_pull_requests.json | 77 +++++++++++++++++++ tap_github/streams.py | 28 ++++++- 2 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 tap_github/schemas/workflow_pull_requests.json diff --git a/tap_github/schemas/workflow_pull_requests.json b/tap_github/schemas/workflow_pull_requests.json new file mode 100644 index 00000000..99ec125b --- /dev/null +++ b/tap_github/schemas/workflow_pull_requests.json @@ -0,0 +1,77 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "workflow_run_id": { + "type": ["null", "integer"] + }, + "id": { + "type": ["null", "integer"] + }, + "number": { + "type": ["null", "integer"] + }, + "url": { + "type": ["null", "string"] + }, + "head_sha": { + "type": ["null", "string"] + }, + "head": { + "type": ["null", "object"], + "properties": { + "ref": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "repo": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + } + } + } + } + }, + "base_sha": { + "type": ["null", "string"] + }, + "base": { + "type": ["null", "object"], + "properties": { + "ref": { + "type": ["null", "string"] + }, + "sha": { + "type": ["null", "string"] + }, + "repo": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + } + } + } + } + } +} + } + \ No newline at end of file diff --git a/tap_github/streams.py b/tap_github/streams.py index 7f19cbc2..52e43b28 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -1039,7 +1039,9 @@ class WorkflowRuns(IncrementalStream): key_properties = ["id"] path = "actions/runs" result_path = "workflow_runs" - filter_param_custom = "created:>=" + filter_param_custom = "created=>=" + children = ["workflow_pull_requests"] + has_children = True def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -1050,6 +1052,27 @@ def add_fields_at_1st_level(self, record, parent_record = None): record['triggering_actor_id'] = self.get_field(record,['triggering_actor','id']) record['repository_id'] = self.get_field(record,['repository','id']) +class WorkflowPullRequests(IncrementalStream): + ''' + https://docs.github.com/en/rest/actions/workflows?apiVersion=2022-11-28#list-repository-workflows + ''' + tap_stream_id = "workflow_pull_requests" + replication_method = "INCREMENTAL" + replication_keys = "updated_at" + key_properties = ["workflow_run_id","id"] + no_path = True + inherit_parent_fields = [("workflow_run_id","id"), ("_sdc_repository","_sdc_repository")] + inherit_array_parent_fields = "pull_requests" + parent = 'workflow_runs' + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + if not record: return + record['head_sha'] = self.get_field(record,['head','sha']) + record['base_sha'] = self.get_field(record,['base','sha']) + # Dictionary of the stream classes STREAMS = { "repositories": Repositories, @@ -1088,5 +1111,6 @@ def add_fields_at_1st_level(self, record, parent_record = None): "deployments": Deployments, "deployment_statuses": DeploymentStatuses, "workflows": Workflows, - "workflow_runs": WorkflowRuns + "workflow_runs": WorkflowRuns, + "workflow_pull_requests": WorkflowPullRequests } From 14a7341133baf303b4342905cf8bd3233478f4ec Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Fri, 14 Apr 2023 12:47:01 +0900 Subject: [PATCH 22/37] fix:Add repository_topics to orgs sync tables --- tap_github/streams.py | 1 + tap_github/sync.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 52e43b28..5de9bfe3 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -974,6 +974,7 @@ class RepositoryTopics(FullTableStream): replication_method = "FULL_TABLE" key_properties = ["repository","topic"] no_path = True + id_keys = ["full_name"] inherit_parent_fields = [("repository","full_name")] inherit_array_parent_fields = "topics" custom_column_name = "topic" diff --git a/tap_github/sync.py b/tap_github/sync.py index 38188b9a..65d8734d 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -4,7 +4,7 @@ from tap_github.streams import STREAMS LOGGER = singer.get_logger() -STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships', 'repositories'] +STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships', 'repositories', 'repository_topics'] def get_selected_streams(catalog): ''' From 011172e34f922b2c0d71bfd8ae6158804c71840f Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Fri, 14 Apr 2023 13:49:51 +0900 Subject: [PATCH 23/37] fix:Final fixes for new tables --- tap_github/schemas/repositories.json | 6 ------ tap_github/streams.py | 1 - 2 files changed, 7 deletions(-) diff --git a/tap_github/schemas/repositories.json b/tap_github/schemas/repositories.json index d1b24a59..d202520e 100644 --- a/tap_github/schemas/repositories.json +++ b/tap_github/schemas/repositories.json @@ -64,12 +64,6 @@ "is_template": { "type": ["null", "boolean"] }, - "topics": { - "type": ["null", "array"], - "items": { - "type": ["null", "string"] - } - }, "has_issues": { "type": ["null", "boolean"] }, diff --git a/tap_github/streams.py b/tap_github/streams.py index 5de9bfe3..37cc54d9 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -1024,7 +1024,6 @@ class Workflows(FullTableStream): ''' tap_stream_id = "workflows" replication_method = "FULL_TABLE" - use_repository = True key_properties = ["id"] path = "actions/workflows" result_path = "workflows" From 356a2952bf94d5f51baf0653de7438f0b73bcc1c Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Fri, 14 Apr 2023 14:17:58 +0900 Subject: [PATCH 24/37] feat: Set version to 2.1.0 --- CHANGELOG.md | 24 +++++++++++++++++++++++- setup.py | 2 +- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20ce0ebd..98836a91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,29 @@ # Changelog # 2.1.0 - * Add repositories table + * Add the following tables: + * release_asset + * branches + * commit_files + * commit_parents + * commit_pull_request + * commit_users_emails + * deployments + * deployment_statuses + * issue_assignees + * issue_labels + * repository_teams + * repository_topics + * repositories + * workflows + * workflow_runs + * workflow_pull_requests + * Add ability to create a child table with no endpoint to call, for normalizing data from parent which has a column of `array` type. + * Add ability to inherit fields from parent streams (both normal and array-like columns) + * Create option to add a custom column name when the inherited array is not an array of objects, but an array of strings. + * Add ability to look through a path on the response for the array of values. + * Add custom filter param for endpoints which require a different filter than `since`. + # 2.0.0 * Schema updates [#170](https://github.com/singer-io/tap-github/pull/170) [#169](https://github.com/singer-io/tap-github/pull/169) diff --git a/setup.py b/setup.py index b6c06fef..c55e6029 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.0', + version='2.1.0', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 79891f0edd170b41b2a6d746e65f24642885b553 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Fri, 14 Apr 2023 14:46:17 +0900 Subject: [PATCH 25/37] fix: Update links for table objects --- tap_github/streams.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 37cc54d9..f7d3eb67 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -699,7 +699,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): class CommitFiles(IncrementalStream): ''' - https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository + Child of "commits" - https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository ''' tap_stream_id = "commit_files" replication_method = "INCREMENTAL" @@ -712,7 +712,7 @@ class CommitFiles(IncrementalStream): class CommitParents(IncrementalStream): ''' - https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository + Child of "commits" - https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository ''' tap_stream_id = "commit_parents" replication_method = "INCREMENTAL" @@ -725,7 +725,7 @@ class CommitParents(IncrementalStream): class CommitPullRequest(IncrementalStream): ''' - https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository + https://docs.github.com/en/rest/commits/commits#list-pull-requests-associated-with-a-commit ''' tap_stream_id = "commit_pull_request" replication_method = "INCREMENTAL" @@ -745,7 +745,6 @@ def add_fields_at_1st_level(self, record, parent_record = None): record['pull_request_id'] = self.get_field(record,['id']) - class UserEmail(IncrementalStream): ''' Created from fields of Commits table @@ -784,7 +783,7 @@ class Issues(IncrementalOrderedStream): class IssueAssignees(IncrementalOrderedStream): ''' - https://docs.github.com/en/rest/issues/issues#list-repository-issues + Child of "issues" - https://docs.github.com/en/rest/issues/issues#list-repository-issues ''' tap_stream_id = "issue_assignees" replication_method = "INCREMENTAL" @@ -797,7 +796,7 @@ class IssueAssignees(IncrementalOrderedStream): class IssueLabels(IncrementalOrderedStream): ''' - https://docs.github.com/en/rest/issues/issues#list-repository-issues + Child of "issues" - https://docs.github.com/en/rest/issues/issues#list-repository-issues ''' tap_stream_id = "issue_labels" replication_method = "INCREMENTAL" @@ -830,7 +829,7 @@ class Releases(FullTableStream): class ReleaseAssets(FullTableStream): ''' - https://docs.github.com/en/rest/releases/releases#list-releases + Child of "releases" - https://docs.github.com/en/rest/releases/releases#list-releases ''' tap_stream_id = "release_assets" replication_method = "FULL_TABLE" @@ -940,7 +939,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): class Repositories(FullTableStream): ''' - https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-organization-repositories + https://docs.github.com/en/rest/repos/repos#list-organization-repositories ''' tap_stream_id = "repositories" replication_method = "FULL_TABLE" @@ -968,7 +967,7 @@ class RepositoryTeams(FullTableStream): class RepositoryTopics(FullTableStream): ''' - https://docs.github.com/en/rest/repos/repos#list-repository-teams + Child of "repositories" - https://docs.github.com/en/rest/repos/repos#list-organization-repositories ''' tap_stream_id = "repository_topics" replication_method = "FULL_TABLE" @@ -1020,7 +1019,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): class Workflows(FullTableStream): ''' - https://docs.github.com/en/rest/actions/workflows?apiVersion=2022-11-28#list-repository-workflows + https://docs.github.com/en/rest/actions/workflows#list-repository-workflows ''' tap_stream_id = "workflows" replication_method = "FULL_TABLE" @@ -1030,7 +1029,7 @@ class Workflows(FullTableStream): class WorkflowRuns(IncrementalStream): ''' - https://docs.github.com/en/rest/actions/workflows?apiVersion=2022-11-28#list-repository-workflows + https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository ''' tap_stream_id = "workflow_runs" replication_method = "INCREMENTAL" @@ -1054,7 +1053,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): class WorkflowPullRequests(IncrementalStream): ''' - https://docs.github.com/en/rest/actions/workflows?apiVersion=2022-11-28#list-repository-workflows + Child of "workflow_runs" - https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository ''' tap_stream_id = "workflow_pull_requests" replication_method = "INCREMENTAL" From b80d012022326c9e0a7deccbd562b176afcbb596 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Mon, 17 Apr 2023 13:17:31 +0900 Subject: [PATCH 26/37] fix:Add error handling for 410 responses --- tap_github/client.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 9913a8c2..d4f32a60 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -45,6 +45,9 @@ class MovedPermanentlyError(GithubException): class ConflictError(GithubException): pass +class DisabledResourceError(GithubException): + pass + class RateLimitExceeded(GithubException): pass @@ -81,6 +84,10 @@ class TooManyRequests(GithubException): "raise_exception": ConflictError, "message": "The request could not be completed due to a conflict with the current state of the server." }, + 410: { + "raise_exception": DisabledResourceError, + "message": "The request resource is disabled for the repository." + }, 422: { "raise_exception": UnprocessableError, "message": "The request was not able to process right now." @@ -105,7 +112,7 @@ def raise_for_error(resp, source, stream, client, should_skip_404): except JSONDecodeError: response_json = {} - if error_code == 404 and should_skip_404: + if (error_code == 404 or error_code == 410) and should_skip_404: # Add not accessible stream into list. client.not_accessible_repos.add(stream) details = ERROR_CODE_EXCEPTION_MAPPING.get(error_code).get("message") @@ -196,10 +203,11 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) self.session.headers.update(headers) resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) if resp.status_code != 200: + LOGGER.info(f'Found a non 200 response: {url}, {resp.status_code}') raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code rate_throttling(resp, self.max_sleep_seconds) - if resp.status_code == 404: + if resp.status_code == 404 or resp.status_code == 410: # Return an empty response body since we're not raising a NotFoundException resp._content = b'{}' # pylint: disable=protected-access return resp From 8588d89fc9c49b02929532d1cd6352d96ef9be1c Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Mon, 17 Apr 2023 13:18:44 +0900 Subject: [PATCH 27/37] fix: Call API for commit_files tables --- tap_github/streams.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index f7d3eb67..078c425a 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -201,6 +201,8 @@ def get_child_records(self, else: # Write JSON response directly if it is a single record only. records['_sdc_repository'] = repo_path + for column, field in child_object.inherit_parent_fields: + records[column] = parent_record.get(field) child_object.add_fields_at_1st_level(record = records, parent_record = parent_record) with singer.Transformer() as transformer: @@ -705,10 +707,11 @@ class CommitFiles(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["commit_sha", "filename"] - no_path = True + use_repository = True + path = "commits/{}" inherit_parent_fields = [("commit_sha","sha"), ("_sdc_repository","_sdc_repository")] - inherit_array_parent_fields = "files" parent = 'commits' + result_path = "files" class CommitParents(IncrementalStream): ''' From 872f7a9636b1d92a73fca5f4a22502fbd8bf7fca Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Mon, 17 Apr 2023 13:20:01 +0900 Subject: [PATCH 28/37] feat: Add additional table collaborator_details --- tap_github/schemas/collaborator_details.json | 104 +++++++++++++++++++ tap_github/schemas/collaborators.json | 3 - tap_github/streams.py | 15 +++ 3 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 tap_github/schemas/collaborator_details.json diff --git a/tap_github/schemas/collaborator_details.json b/tap_github/schemas/collaborator_details.json new file mode 100644 index 00000000..c500845d --- /dev/null +++ b/tap_github/schemas/collaborator_details.json @@ -0,0 +1,104 @@ +{ + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "bio": { + "type": ["null", "string"] + }, + "blog": { + "type": ["null", "string"] + }, + "company": { + "type": ["null", "string"] + }, + "followers": { + "type": ["null", "integer"] + }, + "following": { + "type": ["null", "integer"] + }, + "hireable": { + "type": ["null", "boolean"] + }, + "location": { + "type": ["null", "string"] + }, + "twitter_username": { + "type": ["null", "string"] + }, + "public_gists": { + "type": ["null", "integer"] + }, + "public_repos": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + } + } + } + \ No newline at end of file diff --git a/tap_github/schemas/collaborators.json b/tap_github/schemas/collaborators.json index 9f71ac07..a997a5fb 100644 --- a/tap_github/schemas/collaborators.json +++ b/tap_github/schemas/collaborators.json @@ -67,9 +67,6 @@ }, "role_name": { "type": ["null", "string"] - }, - "_sdc_repository": { - "type": ["string"] } } } diff --git a/tap_github/streams.py b/tap_github/streams.py index 078c425a..4c6f05f8 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -923,6 +923,20 @@ class Collaborators(FullTableStream): replication_method = "FULL_TABLE" key_properties = ["id"] path = "collaborators" + children = ["collaborator_details"] + has_children = True + +class CollaboratorDetails(FullTableStream): + ''' + https://docs.github.com/en/rest/users/users#get-a-user + ''' + tap_stream_id = "collaborator_details" + replication_method = "FULL_TABLE" + key_properties = ["id"] + id_keys = ["login"] + path = "users/{}" + parent = 'collaborators' + class StarGazers(FullTableStream): ''' @@ -1108,6 +1122,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): "team_members": TeamMembers, "team_memberships": TeamMemberships, "collaborators": Collaborators, + "collaborator_details": CollaboratorDetails, "stargazers": StarGazers, "commit_users_emails": UserEmail, "deployments": Deployments, From f66b2498b996a150207fa9dc57a276a349b953f2 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Mon, 17 Apr 2023 14:09:35 +0900 Subject: [PATCH 29/37] fix:Add id_keys to commit_files --- tap_github/streams.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tap_github/streams.py b/tap_github/streams.py index 4c6f05f8..6eecc0e6 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -707,6 +707,7 @@ class CommitFiles(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["commit_sha", "filename"] + id_keys = ["sha"] use_repository = True path = "commits/{}" inherit_parent_fields = [("commit_sha","sha"), ("_sdc_repository","_sdc_repository")] From 22460b4233640b62c3740a26b41690124e99dc22 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Tue, 18 Apr 2023 13:06:30 +0900 Subject: [PATCH 30/37] fix:Incremental sync final fixes --- CHANGELOG.md | 1 + ...s.json => workflow_run_pull_requests.json} | 0 tap_github/streams.py | 29 ++++++++----------- 3 files changed, 13 insertions(+), 17 deletions(-) rename tap_github/schemas/{workflow_pull_requests.json => workflow_run_pull_requests.json} (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98836a91..e2e115eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ * workflows * workflow_runs * workflow_pull_requests + * collaborator_details * Add ability to create a child table with no endpoint to call, for normalizing data from parent which has a column of `array` type. * Add ability to inherit fields from parent streams (both normal and array-like columns) * Create option to add a custom column name when the inherited array is not an array of objects, but an array of strings. diff --git a/tap_github/schemas/workflow_pull_requests.json b/tap_github/schemas/workflow_run_pull_requests.json similarity index 100% rename from tap_github/schemas/workflow_pull_requests.json rename to tap_github/schemas/workflow_run_pull_requests.json diff --git a/tap_github/streams.py b/tap_github/streams.py index 6eecc0e6..9e56e59e 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -5,14 +5,14 @@ LOGGER = singer.get_logger() DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ' -def get_bookmark(state, repo, stream_name, bookmark_key, start_date): +def get_bookmark(state, repo, stream_name, bookmark_key, start_date, is_incremental = True): """ Return bookmark value if available in the state otherwise return start date """ - repo_stream_dict = bookmarks.get_bookmark(state, repo, stream_name) - if repo_stream_dict: - return repo_stream_dict.get(bookmark_key) - + if is_incremental: + repo_stream_dict = bookmarks.get_bookmark(state, repo, stream_name) + if repo_stream_dict: + return repo_stream_dict.get(bookmark_key) return start_date def get_schema(catalog, stream_id): @@ -155,7 +155,8 @@ def get_child_records(self, """ child_object = STREAMS[child_stream]() - child_bookmark_value = get_bookmark(state, repo_path, child_object.tap_stream_id, "since", start_date) + is_stream_incremental = child_object.replication_method == "INCREMENTAL" and child_object.replication_keys + child_bookmark_value = get_bookmark(state, repo_path, child_object.tap_stream_id, "since", start_date, is_stream_incremental) if not parent_id: parent_id = grand_parent_id @@ -701,11 +702,10 @@ def add_fields_at_1st_level(self, record, parent_record = None): class CommitFiles(IncrementalStream): ''' - Child of "commits" - https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository + Child of "commits" - https://docs.github.com/en/rest/commits/commits#get-a-commit ''' tap_stream_id = "commit_files" replication_method = "INCREMENTAL" - replication_keys = "updated_at" key_properties = ["commit_sha", "filename"] id_keys = ["sha"] use_repository = True @@ -714,13 +714,12 @@ class CommitFiles(IncrementalStream): parent = 'commits' result_path = "files" -class CommitParents(IncrementalStream): +class CommitParents(FullTableStream): ''' Child of "commits" - https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository ''' tap_stream_id = "commit_parents" replication_method = "INCREMENTAL" - replication_keys = "updated_at" key_properties = ["children_sha","sha"] no_path = True inherit_parent_fields = [("children_sha","sha"), ("_sdc_repository","_sdc_repository")] @@ -733,7 +732,6 @@ class CommitPullRequest(IncrementalStream): ''' tap_stream_id = "commit_pull_request" replication_method = "INCREMENTAL" - replication_keys = "updated_at" key_properties = ["commit_sha","pull_request_id"] path = "commits/{}/pulls" use_repository = True @@ -791,7 +789,6 @@ class IssueAssignees(IncrementalOrderedStream): ''' tap_stream_id = "issue_assignees" replication_method = "INCREMENTAL" - replication_keys = "updated_at" key_properties = ["issue_id","id"] no_path = True inherit_parent_fields = [("issue_id","id"), ("_sdc_repository","_sdc_repository")] @@ -804,7 +801,6 @@ class IssueLabels(IncrementalOrderedStream): ''' tap_stream_id = "issue_labels" replication_method = "INCREMENTAL" - replication_keys = "updated_at" key_properties = ["issue_id","id"] no_path = True inherit_parent_fields = [("issue_id","id"), ("_sdc_repository","_sdc_repository")] @@ -1057,7 +1053,7 @@ class WorkflowRuns(IncrementalStream): path = "actions/runs" result_path = "workflow_runs" filter_param_custom = "created=>=" - children = ["workflow_pull_requests"] + children = ["workflow_run_pull_requests"] has_children = True def add_fields_at_1st_level(self, record, parent_record = None): @@ -1073,9 +1069,8 @@ class WorkflowPullRequests(IncrementalStream): ''' Child of "workflow_runs" - https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository ''' - tap_stream_id = "workflow_pull_requests" + tap_stream_id = "workflow_run_pull_requests" replication_method = "INCREMENTAL" - replication_keys = "updated_at" key_properties = ["workflow_run_id","id"] no_path = True inherit_parent_fields = [("workflow_run_id","id"), ("_sdc_repository","_sdc_repository")] @@ -1130,5 +1125,5 @@ def add_fields_at_1st_level(self, record, parent_record = None): "deployment_statuses": DeploymentStatuses, "workflows": Workflows, "workflow_runs": WorkflowRuns, - "workflow_pull_requests": WorkflowPullRequests + "workflow_run_pull_requests": WorkflowPullRequests } From c0c2c4502b32c758f6f9e22676523a995facee08 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Thu, 20 Apr 2023 15:01:22 +0900 Subject: [PATCH 31/37] fix:Increase request per_page filter and save bookmark at every call --- tap_github/streams.py | 75 +++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 20 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 9e56e59e..7f3b8b9c 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -4,6 +4,7 @@ LOGGER = singer.get_logger() DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ' +PER_PAGE_NUMBER = 100 def get_bookmark(state, repo, stream_name, bookmark_key, start_date, is_incremental = True): """ @@ -62,8 +63,9 @@ class Stream: replication_keys = None key_properties = [] path = None - filter_param = False - filter_param_custom = "" + since_filter_param = "" + since_filter_param_custom = "" + additional_filters = "" id_keys = [] use_organization = False children = [] @@ -81,12 +83,14 @@ def build_url(self, base_url, repo_path, bookmark): """ Build the full url with parameters and attributes. """ - if self.filter_param: + if self.since_filter_param: # Add the since parameter for incremental streams - query_string = '?since={}'.format(bookmark) - elif self.filter_param_custom: + query_string = '?since={}{}'.format(bookmark,self.since_filter_param) + elif self.since_filter_param_custom: # Add additional custom filter for incremental streams - query_string = f'?{self.filter_param_custom}{bookmark}' + query_string = f'?{self.since_filter_param_custom}{bookmark}' + elif self.additional_filters: + query_string = f'?{self.additional_filters}' else: query_string = '' @@ -402,6 +406,8 @@ def sync_endpoint(self, stream_to_sync, selected_stream_ids, parent_record = record) + # Write bookmark for incremental stream. + self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) else: LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.", self.tap_stream_id, self.key_properties, record[self.key_properties], self.replication_keys) @@ -492,6 +498,8 @@ def sync_endpoint(self, stream_to_sync, selected_stream_ids, parent_record = record) + # Write bookmark for incremental stream. + self.write_bookmarks(self.tap_stream_id, selected_stream_ids, bookmark_value, repo_path, state) else: LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.", self.tap_stream_id, self.key_properties, record[self.key_properties], self.replication_keys) @@ -512,6 +520,7 @@ class Reviews(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "submitted_at" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "pulls/{}/reviews" use_repository = True id_keys = ['number'] @@ -525,13 +534,14 @@ def add_fields_at_1st_level(self, record, parent_record = None): class ReviewComments(IncrementalOrderedStream): ''' - https://docs.github.com/en/rest/pulls/comments#get-a-review-comment-for-a-pull-request + https://docs.github.com/en/rest/pulls/comments#list-review-comments-on-a-pull-request ''' tap_stream_id = "review_comments" replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - path = "pulls/{}/comments?sort=updated_at&direction=desc" + additional_filters = f"sort=updated_at&direction=desc&per_page{PER_PAGE_NUMBER}" + path = "pulls/{}/comments" use_repository = True id_keys = ['number'] parent = 'pull_requests' @@ -550,6 +560,7 @@ class PRCommits(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "pulls/{}/commits" use_repository = True id_keys = ['number'] @@ -573,7 +584,8 @@ class PullRequests(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - path = "pulls?state=all&sort=updated&direction=desc" + additional_filters = f"state=all&sort=updated&direction=desc&per_page{PER_PAGE_NUMBER}" + path = "pulls" children = ['reviews', 'review_comments', 'pr_commits'] has_children = True pk_child_fields = ["number"] @@ -586,6 +598,7 @@ class ProjectCards(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "projects/columns/{}/cards" tap_stream_id = "project_cards" parent = 'project_columns' @@ -599,6 +612,7 @@ class ProjectColumns(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "projects/{}/columns" children = ["project_cards"] parent = "projects" @@ -613,7 +627,8 @@ class Projects(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - path = "projects?state=all" + additional_filters = f"state=all&per_page{PER_PAGE_NUMBER}" + path = "projects" tap_stream_id = "projects" children = ["project_columns"] has_children = True @@ -644,6 +659,7 @@ class TeamMembers(FullTableStream): tap_stream_id = "team_members" replication_method = "FULL_TABLE" key_properties = ["team_slug", "id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "orgs/{}/teams/{}/members" use_organization = True id_keys = ['slug'] @@ -666,6 +682,7 @@ class Teams(FullTableStream): tap_stream_id = "teams" replication_method = "FULL_TABLE" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "orgs/{}/teams" use_organization = True children = ["team_members"] @@ -683,7 +700,7 @@ class Commits(IncrementalStream): path = "commits" children= ["commit_users_emails", "commit_files", "commit_parents", "commit_pull_request"] has_children = True - filter_param = True + since_filter_param = f"&per_page={PER_PAGE_NUMBER}" def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -733,6 +750,7 @@ class CommitPullRequest(IncrementalStream): tap_stream_id = "commit_pull_request" replication_method = "INCREMENTAL" key_properties = ["commit_sha","pull_request_id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "commits/{}/pulls" use_repository = True id_keys = ["sha"] @@ -767,8 +785,8 @@ class Comments(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - filter_param = True - path = "issues/comments?sort=updated&direction=desc" + since_filter_param = f"&sort=updated&direction=desc&per_page={PER_PAGE_NUMBER}" + path = "issues/comments" class Issues(IncrementalOrderedStream): ''' @@ -778,8 +796,8 @@ class Issues(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - filter_param = True - path = "issues?state=all&sort=updated&direction=desc" + since_filter_param = f"&state=all&sort=updated&direction=desc&per_page={PER_PAGE_NUMBER}" + path = "issues" children = ["issue_assignees","issue_labels"] has_children = True @@ -814,6 +832,7 @@ class Assignees(FullTableStream): tap_stream_id = "assignees" replication_method = "FULL_TABLE" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "assignees" class Releases(FullTableStream): @@ -823,7 +842,8 @@ class Releases(FullTableStream): tap_stream_id = "releases" replication_method = "FULL_TABLE" key_properties = ["id"] - path = "releases?sort=created_at&direction=desc" + additional_filters = f"sort=created_at&direction=desc&per_page{PER_PAGE_NUMBER}" + path = "releases" children = ["release_assets"] has_children = True @@ -838,6 +858,7 @@ class ReleaseAssets(FullTableStream): id_keys = ["id"] no_path = True inherit_parent_fields = [("release_id","id"), ("_sdc_repository","_sdc_repository")] + additional_filters = f"per_page{PER_PAGE_NUMBER}" inherit_array_parent_fields = "assets" parent = 'releases' @@ -855,6 +876,7 @@ class Branches(FullTableStream): tap_stream_id = "branches" replication_method = "FULL_TABLE" key_properties = ["name"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "branches" def add_fields_at_1st_level(self, record, parent_record = None): @@ -870,6 +892,7 @@ class Labels(FullTableStream): tap_stream_id = "labels" replication_method = "FULL_TABLE" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "labels" class IssueEvents(IncrementalOrderedStream): @@ -880,7 +903,8 @@ class IssueEvents(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "created_at" key_properties = ["id"] - path = "issues/events?sort=created_at&direction=desc" + additional_filters = f"sort=created_at&direction=desc&per_page{PER_PAGE_NUMBER}" + path = "issues/events" class Events(IncrementalStream): ''' @@ -890,6 +914,7 @@ class Events(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "created_at" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "events" class CommitComments(IncrementalStream): @@ -900,6 +925,7 @@ class CommitComments(IncrementalStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "comments" class IssueMilestones(IncrementalOrderedStream): @@ -910,7 +936,8 @@ class IssueMilestones(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - path = "milestones?direction=desc&sort=updated_at" + additional_filters = f"direction=desc&sort=updated_at&per_page{PER_PAGE_NUMBER}" + path = "milestones" class Collaborators(FullTableStream): ''' @@ -919,6 +946,7 @@ class Collaborators(FullTableStream): tap_stream_id = "collaborators" replication_method = "FULL_TABLE" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "collaborators" children = ["collaborator_details"] has_children = True @@ -931,6 +959,7 @@ class CollaboratorDetails(FullTableStream): replication_method = "FULL_TABLE" key_properties = ["id"] id_keys = ["login"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "users/{}" parent = 'collaborators' @@ -942,6 +971,7 @@ class StarGazers(FullTableStream): tap_stream_id = "stargazers" replication_method = "FULL_TABLE" key_properties = ["user_id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "stargazers" headers = {'Accept': 'application/vnd.github.v3.star+json'} @@ -959,6 +989,7 @@ class Repositories(FullTableStream): replication_method = "FULL_TABLE" key_properties = ["id"] use_organization = True + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "orgs/{}/repos" children = ["repository_topics"] has_children = True @@ -977,6 +1008,7 @@ class RepositoryTeams(FullTableStream): tap_stream_id = "repository_teams" replication_method = "FULL_TABLE" key_properties = ["_sdc_repository","id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "teams" class RepositoryTopics(FullTableStream): @@ -1000,7 +1032,8 @@ class Deployments(FullTableStream): tap_stream_id = "deployments" replication_method = "FULL_TABLE" key_properties = ["id"] - path = "deployments?sort=created_at&direction=desc" + additional_filters = f"sort=created_at&direction=desc&per_page{PER_PAGE_NUMBER}" + path = "deployments" children = ["deployment_statuses"] has_children = True @@ -1019,6 +1052,7 @@ class DeploymentStatuses(FullTableStream): replication_method = "FULL_TABLE" use_repository = True key_properties = ["deployment_id","id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "deployments/{}/statuses" id_keys = ["id"] inherit_parent_fields = [("deployment_id","id"),("_sdc_repository","_sdc_repository")] @@ -1038,6 +1072,7 @@ class Workflows(FullTableStream): tap_stream_id = "workflows" replication_method = "FULL_TABLE" key_properties = ["id"] + additional_filters = f"per_page{PER_PAGE_NUMBER}" path = "actions/workflows" result_path = "workflows" @@ -1052,7 +1087,7 @@ class WorkflowRuns(IncrementalStream): key_properties = ["id"] path = "actions/runs" result_path = "workflow_runs" - filter_param_custom = "created=>=" + since_filter_param_custom = f"per_page={PER_PAGE_NUMBER}&created=>=" children = ["workflow_run_pull_requests"] has_children = True From a03e191f62b0dc0e1bee26990898e2fb6f04e58e Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Thu, 20 Apr 2023 15:02:56 +0900 Subject: [PATCH 32/37] fix:Stop sending same schema multiple times --- tap_github/sync.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tap_github/sync.py b/tap_github/sync.py index 65d8734d..1d33cb64 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -5,6 +5,7 @@ LOGGER = singer.get_logger() STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships', 'repositories', 'repository_topics'] +schemas_sent = [] def get_selected_streams(catalog): ''' @@ -47,6 +48,7 @@ def get_ordered_stream_list(currently_syncing, streams_to_sync): """ Get an ordered list of remaining streams to sync other streams followed by synced streams. """ + LOGGER.info(f'Currently syncing stream: {currently_syncing}') stream_list = list(sorted(streams_to_sync)) if currently_syncing in stream_list: index = stream_list.index(currently_syncing) @@ -58,6 +60,7 @@ def get_ordered_repos(state, repositories): Get an ordered list of remaining repos to sync followed by synced repos. """ syncing_repo = state.get("currently_syncing_repo") + LOGGER.info(f'Currently syncing repo from state: {syncing_repo}') if syncing_repo in repositories: index = repositories.index(syncing_repo) repositories = repositories[index:] + repositories[:index] @@ -163,7 +166,9 @@ def write_schemas(stream_id, catalog, selected_streams): if stream_id in selected_streams: # Get catalog object for particular stream. stream = [cat for cat in catalog['streams'] if cat['tap_stream_id'] == stream_id ][0] - singer.write_schema(stream_id, stream['schema'], stream['key_properties']) + if stream_id not in schemas_sent: + singer.write_schema(stream_id, stream['schema'], stream['key_properties']) + schemas_sent.append(stream_id) for child in stream_obj.children: write_schemas(child, catalog, selected_streams) From 0749029a40a3bc7445e4e1cfa4f5be0710de7a3d Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Tue, 25 Apr 2023 10:34:28 +0900 Subject: [PATCH 33/37] fix:Reduce request size for commits table --- tap_github/streams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 7f3b8b9c..6bf53be2 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -700,7 +700,7 @@ class Commits(IncrementalStream): path = "commits" children= ["commit_users_emails", "commit_files", "commit_parents", "commit_pull_request"] has_children = True - since_filter_param = f"&per_page={PER_PAGE_NUMBER}" + since_filter_param = f"&per_page=30" def add_fields_at_1st_level(self, record, parent_record = None): """ From 4c328327f33e397fba4ca57ed7b3384f08c62eff Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 26 Apr 2023 10:25:39 +0900 Subject: [PATCH 34/37] fix:Introduce IncrementalDateStream for tables to fetch by date --- tap_github/client.py | 1 + tap_github/streams.py | 131 ++++++++++++++++++++++++++++++++++++++---- tap_github/sync.py | 9 +-- 3 files changed, 125 insertions(+), 16 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index d4f32a60..28b05c93 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -223,6 +223,7 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 # Fetch the next page if next found in the response. if 'next' in r.links: url = r.links['next']['url'] + LOGGER.info(f'Found a next link: {url}') else: # Break the loop if all pages are fetched. break diff --git a/tap_github/streams.py b/tap_github/streams.py index 6bf53be2..e77c3d5c 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -1,10 +1,11 @@ -from datetime import datetime +from datetime import datetime, timedelta import singer from singer import (metrics, bookmarks, metadata) LOGGER = singer.get_logger() DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ' PER_PAGE_NUMBER = 100 +DATE_RANGE_WINDOW = 7 def get_bookmark(state, repo, stream_name, bookmark_key, start_date, is_incremental = True): """ @@ -16,6 +17,18 @@ def get_bookmark(state, repo, stream_name, bookmark_key, start_date, is_incremen return repo_stream_dict.get(bookmark_key) return start_date +def get_date_ranges(start_date, end_date, date_range_window=DATE_RANGE_WINDOW): + """ + Return a list of date ranges to be used for the API calls. + """ + start_date = datetime.strptime(start_date, DATE_FORMAT) + end_date = datetime.strptime(end_date, DATE_FORMAT) + while start_date < end_date: + temp_end_date=start_date + timedelta(days=date_range_window) + date_ranges=(start_date.strftime(DATE_FORMAT),temp_end_date.strftime(DATE_FORMAT)) + start_date = temp_end_date + yield date_ranges + def get_schema(catalog, stream_id): """ Return catalog of the specified stream. @@ -88,7 +101,7 @@ def build_url(self, base_url, repo_path, bookmark): query_string = '?since={}{}'.format(bookmark,self.since_filter_param) elif self.since_filter_param_custom: # Add additional custom filter for incremental streams - query_string = f'?{self.since_filter_param_custom}{bookmark}' + query_string = f'?{self.since_filter_param_custom}'.format(**bookmark) elif self.additional_filters: query_string = f'?{self.additional_filters}' else: @@ -273,7 +286,8 @@ def sync_endpoint(self, repo_path, start_date, selected_stream_ids, - stream_to_sync + stream_to_sync, + config, ): """ A common function sync full table streams. @@ -336,7 +350,8 @@ def sync_endpoint(self, repo_path, start_date, selected_stream_ids, - stream_to_sync + stream_to_sync, + config, ): """ @@ -406,8 +421,6 @@ def sync_endpoint(self, stream_to_sync, selected_stream_ids, parent_record = record) - # Write bookmark for incremental stream. - self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) else: LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.", self.tap_stream_id, self.key_properties, record[self.key_properties], self.replication_keys) @@ -416,6 +429,98 @@ def sync_endpoint(self, self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) return state + +class IncrementalDateStream(Stream): + def sync_endpoint(self, + client, + state, + catalog, + repo_path, + start_date, + selected_stream_ids, + stream_to_sync, + config, + ): + + """ + A common function sync incremental streams. Sync an incremental stream for which records are not + in descending order. For, incremental streams iterate all records, write only newly updated records and + write the latest bookmark value. + """ + + parent_bookmark_value = get_bookmark(state, repo_path, self.tap_stream_id, "since", start_date) + current_time = datetime.today().strftime(DATE_FORMAT) + min_bookmark_value = self.get_min_bookmark(self.tap_stream_id, selected_stream_ids, current_time, repo_path, start_date, state) + + max_bookmark_value = min_bookmark_value + LOGGER.info(f'Starting stream with bookmark {min_bookmark_value} and current time {current_time}') + for start_date, end_date in get_date_ranges(min_bookmark_value, current_time, config.get('date_range_window', DATE_RANGE_WINDOW)): + # build full url + full_url = self.build_url(client.base_url, repo_path, {'from': start_date, 'until': end_date}) + + stream_catalog = get_schema(catalog, self.tap_stream_id) + + with metrics.record_counter(self.tap_stream_id) as counter: + for response in client.authed_get_all_pages( + self.tap_stream_id, + full_url, + self.headers, + stream = self.tap_stream_id + ): + records = response.json() + if self.result_path: records = records.get(self.result_path,[]) + extraction_time = singer.utils.now() + # Loop through all records + for record in records: + record['_sdc_repository'] = repo_path + self.add_fields_at_1st_level(record = record, parent_record = None) + + with singer.Transformer() as transformer: + if record.get(self.replication_keys): + if record[self.replication_keys] >= max_bookmark_value: + # Update max_bookmark_value + max_bookmark_value = record[self.replication_keys] + + bookmark_dttm = record[self.replication_keys] + + # Keep only records whose bookmark is after the last_datetime + if bookmark_dttm >= min_bookmark_value: + if self.tap_stream_id in selected_stream_ids and bookmark_dttm >= parent_bookmark_value: + rec = transformer.transform(record, stream_catalog['schema'], metadata=metadata.to_map(stream_catalog['metadata'])) + + singer.write_record(self.tap_stream_id, rec, time_extracted=extraction_time) + counter.increment() + + for child in self.children: + if child in stream_to_sync: + + parent_id = tuple(record.get(key) for key in STREAMS[child]().id_keys) + if STREAMS[child]().id_keys and not all(parent_id): + pass + else: + # Sync child stream, if it is selected or its nested child is selected. + self.get_child_records(client, + catalog, + child, + parent_id, + repo_path, + state, + start_date, + record.get(self.replication_keys), + stream_to_sync, + selected_stream_ids, + parent_record = record) + # Write bookmark for incremental stream. + self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) + else: + LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.", + self.tap_stream_id, self.key_properties, record[self.key_properties], self.replication_keys) + + # Write bookmark for incremental stream. + self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) + singer.write_state(state) + + return state class IncrementalOrderedStream(Stream): @@ -426,7 +531,8 @@ def sync_endpoint(self, repo_path, start_date, selected_stream_ids, - stream_to_sync + stream_to_sync, + config, ): """ A sync function for streams that have records in the descending order of replication key value. For such streams, @@ -485,6 +591,7 @@ def sync_endpoint(self, for child in self.children: if child in stream_to_sync: parent_id = tuple(record.get(key) for key in STREAMS[child]().id_keys) + LOGGER.info(f"Syncing child {child}") # Sync child stream, if it is selected or its nested child is selected. self.get_child_records(client, @@ -689,7 +796,7 @@ class Teams(FullTableStream): has_children = True pk_child_fields = ['slug'] -class Commits(IncrementalStream): +class Commits(IncrementalDateStream): ''' https://docs.github.com/en/rest/commits/commits#list-commits-on-a-repository ''' @@ -700,7 +807,7 @@ class Commits(IncrementalStream): path = "commits" children= ["commit_users_emails", "commit_files", "commit_parents", "commit_pull_request"] has_children = True - since_filter_param = f"&per_page=30" + since_filter_param_custom = "since={from}&until={until}&per_page=30" def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -1076,18 +1183,18 @@ class Workflows(FullTableStream): path = "actions/workflows" result_path = "workflows" -class WorkflowRuns(IncrementalStream): +class WorkflowRuns(IncrementalDateStream): ''' https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository ''' tap_stream_id = "workflow_runs" replication_method = "INCREMENTAL" - replication_keys = "updated_at" + replication_keys = "created_at" use_repository = True key_properties = ["id"] path = "actions/runs" result_path = "workflow_runs" - since_filter_param_custom = f"per_page={PER_PAGE_NUMBER}&created=>=" + since_filter_param_custom = "per_page=100&created={from}..{until}" children = ["workflow_run_pull_requests"] has_children = True diff --git a/tap_github/sync.py b/tap_github/sync.py index 1d33cb64..e9dde313 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -206,7 +206,7 @@ def sync(client, config, state, catalog): for repo in get_ordered_repos(state, repositories): update_currently_syncing_repo(state, repo) LOGGER.info("Starting sync of repository: %s", repo) - do_sync(catalog, streams_to_sync_for_repos, selected_stream_ids, client, start_date, state, repo) + do_sync(catalog, streams_to_sync_for_repos, selected_stream_ids, client, start_date, state, repo, config) if client.not_accessible_repos: # Give warning messages for a repo that is not accessible by a stream or is invalid. @@ -215,14 +215,14 @@ def sync(client, config, state, catalog): client.not_accessible_repos = set() update_currently_syncing_repo(state, None) -def do_sync(catalog, streams_to_sync, selected_stream_ids, client, start_date, state, repo): +def do_sync(catalog, streams_to_sync, selected_stream_ids, client, start_date, state, repo, config= {}): """ Sync all other streams except teams, team_members and team_memberships for each repo. """ currently_syncing = singer.get_currently_syncing(state) for stream_id in get_ordered_stream_list(currently_syncing, streams_to_sync): stream_obj = STREAMS[stream_id]() - + LOGGER.info(f'Starting stream {stream_id} for {repo}.') # If it is a "sub_stream", it will be synced as part of the parent stream if stream_id in streams_to_sync and not stream_obj.parent: write_schemas(stream_id, catalog, selected_stream_ids) @@ -234,7 +234,8 @@ def do_sync(catalog, streams_to_sync, selected_stream_ids, client, start_date, s repo_path = repo, start_date = start_date, selected_stream_ids = selected_stream_ids, - stream_to_sync = streams_to_sync + stream_to_sync = streams_to_sync, + config = config, ) singer.write_state(state) From 8c8ed59289eb0244561c74368198c359e3de8af6 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 26 Apr 2023 13:22:44 +0900 Subject: [PATCH 35/37] fix:Change ordered stream order to ascending --- tap_github/streams.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index e77c3d5c..820163c7 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -510,8 +510,8 @@ def sync_endpoint(self, stream_to_sync, selected_stream_ids, parent_record = record) - # Write bookmark for incremental stream. - self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) + # Write bookmark for incremental stream. + self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) else: LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.", self.tap_stream_id, self.key_properties, record[self.key_properties], self.replication_keys) @@ -647,7 +647,7 @@ class ReviewComments(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - additional_filters = f"sort=updated_at&direction=desc&per_page{PER_PAGE_NUMBER}" + additional_filters = f"sort=updated_at&direction=asc&per_page{PER_PAGE_NUMBER}" path = "pulls/{}/comments" use_repository = True id_keys = ['number'] @@ -691,7 +691,7 @@ class PullRequests(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - additional_filters = f"state=all&sort=updated&direction=desc&per_page{PER_PAGE_NUMBER}" + additional_filters = f"state=all&sort=updated&direction=asc&per_page{PER_PAGE_NUMBER}" path = "pulls" children = ['reviews', 'review_comments', 'pr_commits'] has_children = True @@ -892,7 +892,7 @@ class Comments(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - since_filter_param = f"&sort=updated&direction=desc&per_page={PER_PAGE_NUMBER}" + since_filter_param = f"&sort=updated&direction=asc&per_page={PER_PAGE_NUMBER}" path = "issues/comments" class Issues(IncrementalOrderedStream): @@ -903,7 +903,7 @@ class Issues(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - since_filter_param = f"&state=all&sort=updated&direction=desc&per_page={PER_PAGE_NUMBER}" + since_filter_param = f"&state=all&sort=updated&direction=asc&per_page={PER_PAGE_NUMBER}" path = "issues" children = ["issue_assignees","issue_labels"] has_children = True From 77529776aaa7cb288931b26386a4fda92c181f11 Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Thu, 27 Apr 2023 10:31:09 +0900 Subject: [PATCH 36/37] fix:Add bookmark for IncrementalDateStream --- tap_github/streams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 820163c7..0d1a46d9 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -515,7 +515,7 @@ def sync_endpoint(self, else: LOGGER.warning("Skipping this record for %s stream with %s = %s as it is missing replication key %s.", self.tap_stream_id, self.key_properties, record[self.key_properties], self.replication_keys) - + if max_bookmark_value < start_date: max_bookmark_value = start_date # Write bookmark for incremental stream. self.write_bookmarks(self.tap_stream_id, selected_stream_ids, max_bookmark_value, repo_path, state) singer.write_state(state) From 1ae42842239d048aea9488cae60c71200d50f8cf Mon Sep 17 00:00:00 2001 From: Ken Mishima Date: Wed, 5 Jul 2023 15:20:26 +0900 Subject: [PATCH 37/37] fix:Not skip 404 messages on validation --- tap_github/client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 9913a8c2..000898e0 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -219,12 +219,12 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 # Break the loop if all pages are fetched. break - def verify_repo_access(self, url_for_repo, repo): + def verify_repo_access(self, url_for_repo, repo, should_skip_404 = True): """ Call rest API to verify that the user has sufficient permissions to access this repository. """ try: - self.authed_get("verifying repository access", url_for_repo) + self.authed_get("verifying repository access", url_for_repo, should_skip_404 = should_skip_404) except NotFoundException: # Throwing user-friendly error message as it checks token access message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) @@ -242,7 +242,7 @@ def verify_access_for_repo(self): LOGGER.info("Verifying access of repository: %s", repo) # Verifying for Repo access - self.verify_repo_access(url_for_repo, repo) + self.verify_repo_access(url_for_repo, repo, False) def extract_orgs_from_config(self): """