forked from singer-io/tap-github
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsync.py
More file actions
242 lines (209 loc) · 10.2 KB
/
sync.py
File metadata and controls
242 lines (209 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import collections
import singer
from singer import bookmarks
from tap_github.streams import STREAMS
# Module-level logger obtained from singer; used by every function in this file.
LOGGER = singer.get_logger()
# Stream ids that `sync` runs once per organization instead of once per repository.
STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships', 'repositories', 'repository_topics']
# Stream ids whose schema has already been written by `write_schemas`,
# kept so the same schema is not written again.
schemas_sent = []
def get_selected_streams(catalog):
    '''
    Return the `tap_stream_id` of every catalog stream that is marked selected.

    A stream is considered selected when one of its metadata entries has an
    empty breadcrumb (the stream-level entry) and a truthy `selected` value
    in that entry's `metadata` mapping.
    '''
    return [
        stream['tap_stream_id']
        for stream in catalog['streams']
        for entry in stream['metadata']
        # Only the stream-level entry (empty breadcrumb) decides selection.
        if not entry['breadcrumb'] and entry['metadata'].get('selected', None)
    ]
def update_currently_syncing(state, stream_name):
    """
    Record `stream_name` as the stream being synced, then write the state.

    When `stream_name` is empty and the state already names a currently
    syncing stream, the `currently_syncing` key is removed instead.
    """
    if stream_name or not singer.get_currently_syncing(state):
        singer.set_currently_syncing(state, stream_name)
    else:
        # No stream to record and a marker is present: clear it.
        del state['currently_syncing']
    singer.write_state(state)
def update_currently_syncing_repo(state, repo_path):
    """
    Record `repo_path` as the repository being synced, then write the state.

    When `repo_path` is empty and the state already has a
    `currently_syncing_repo` key, that key is removed instead.
    """
    if repo_path or 'currently_syncing_repo' not in state:
        state['currently_syncing_repo'] = repo_path
    else:
        # No repository to record and a marker is present: clear it.
        del state['currently_syncing_repo']
    singer.write_state(state)
def get_ordered_stream_list(currently_syncing, streams_to_sync):
    """
    Return the streams sorted by name and rotated to start at `currently_syncing`.

    Streams sorted before `currently_syncing` are moved to the end of the
    list. If `currently_syncing` is not among the streams, the plain sorted
    list is returned.
    """
    LOGGER.info(f'Currently syncing stream: {currently_syncing}')
    ordered = sorted(streams_to_sync)
    try:
        pivot = ordered.index(currently_syncing)
    except ValueError:
        # Nothing to resume from: keep the sorted order.
        return ordered
    return ordered[pivot:] + ordered[:pivot]
def get_ordered_repos(state, repositories):
    """
    Return the repositories rotated to start at the one recorded in the state.

    Repositories listed before the state's `currently_syncing_repo` are moved
    to the end. If that repository is not in `repositories`, the input is
    returned as is.
    """
    resume_from = state.get("currently_syncing_repo")
    LOGGER.info(f'Currently syncing repo from state: {resume_from}')
    if resume_from not in repositories:
        return repositories
    split_at = repositories.index(resume_from)
    return repositories[split_at:] + repositories[:split_at]
def translate_state(state, catalog, repositories):
    '''
    Convert a legacy single-repository state into the per-repository layout.

    Legacy states keyed bookmarks directly by stream name:
    {
      "bookmarks": {
        "commits": {
          "since": "2018-11-14T13:21:20.700360Z"
        }
      }
    }

    The current layout nests each stream bookmark under its repository:
    {
      "bookmarks": {
        "singer-io/tap-adwords": {
          "commits": {
            "since": "2018-11-14T13:21:20.700360Z"
          }
        }
        "singer-io/tap-salesforce": {
          "commits": {
            "since": "2018-11-14T13:21:20.700360Z"
          }
        }
      }
    }

    Call this at the start of each run. The incoming `state` is returned
    unchanged when it contains a bookmark key that is neither a catalog
    stream nor a configured repository, or when it already holds a bookmark
    for one of the configured repositories. Otherwise a new state is built
    in which every legacy `since` bookmark is copied under each repository.
    '''
    def _nested_dict():
        # Mapping that creates missing levels on access, so deep keys can be
        # assigned without building the intermediate dicts first.
        return collections.defaultdict(_nested_dict)

    migrated_state = _nested_dict()

    # Top-level bookmark keys: repo names in the current layout, stream names
    # in the legacy layout.
    bookmark_keys = state.get('bookmarks', {}).keys()
    catalog_stream_names = [entry['tap_stream_id'] for entry in catalog['streams']]

    # A key that is neither a known stream (legacy layout) nor a configured
    # repository (current layout) belongs to a repository that is no longer
    # selected. Hand the state back untouched in that case rather than
    # returning an empty one.
    if any(
        key not in catalog_stream_names and key not in repositories
        for key in bookmark_keys
    ):
        return state

    for catalog_entry in catalog['streams']:
        stream_name = catalog_entry['tap_stream_id']
        for repo in repositories:
            if bookmarks.get_bookmark(state, repo, stream_name):
                # Already in the per-repository layout; nothing to migrate.
                return state
            legacy_since = bookmarks.get_bookmark(state, stream_name, 'since')
            if legacy_since:
                migrated_state['bookmarks'][repo][stream_name]['since'] = legacy_since

    return migrated_state
def get_stream_to_sync(catalog):
    """
    Return the names of the streams whose sync function must be called.

    A stream from `STREAMS` is included when it is selected in the catalog,
    or when it is not selected itself but one of its child (or nested child)
    streams is.
    """
    selected = get_selected_streams(catalog)
    return [
        name
        for name, stream_cls in STREAMS.items()
        if name in selected or is_any_child_selected(stream_cls, selected)
    ]
def is_any_child_selected(stream_obj, selected_streams):
    """
    Check whether any child, or nested child, of `stream_obj` is selected.

    `stream_obj.children` holds child stream names; each child's own
    children are looked up through `STREAMS`.

    Returns True as soon as a selected descendant is found, False when
    there are no children or none of the descendants is selected.
    """
    for child in stream_obj.children or ():
        if child in selected_streams:
            return True
        # Only stop on a positive result from the subtree. Returning the
        # recursive result unconditionally would skip the remaining siblings
        # whenever the first child with children has nothing selected.
        if is_any_child_selected(STREAMS[child], selected_streams):
            return True
    return False
def write_schemas(stream_id, catalog, selected_streams):
    """
    Write the schema for `stream_id` and, recursively, for its child streams.

    A schema is written only when the stream is selected and its id is not
    yet recorded in the module-level `schemas_sent` list. Children are
    always visited, whether or not the parent's schema was written.
    """
    stream_instance = STREAMS[stream_id]()

    if stream_id in selected_streams:
        # Find this stream's entry in the catalog.
        catalog_entry = [
            entry for entry in catalog['streams']
            if entry['tap_stream_id'] == stream_id
        ][0]
        already_sent = stream_id in schemas_sent
        if not already_sent:
            singer.write_schema(stream_id, catalog_entry['schema'], catalog_entry['key_properties'])
            schemas_sent.append(stream_id)

    for child_id in stream_instance.children:
        write_schemas(child_id, catalog, selected_streams)
def sync(client, config, state, catalog):
    """
    Sync the selected streams for the configured organizations and repositories.

    The state is first translated to the per-repository layout and written
    out. If at least one stream is selected, the streams listed in
    `STREAM_TO_SYNC_FOR_ORGS` are synced once per organization, and every
    other stream is synced once per repository, starting from the repository
    recorded in the state.
    """
    start_date = config['start_date']

    # Selected streams, plus any parents needed to reach selected children.
    selected_stream_ids = get_selected_streams(catalog)
    streams_to_sync = get_stream_to_sync(catalog)
    LOGGER.info('Sync stream %s', streams_to_sync)

    repositories, organizations = client.extract_repos_from_config()

    state = translate_state(state, catalog, repositories)
    singer.write_state(state)

    # Streams in STREAM_TO_SYNC_FOR_ORGS run once per organization.
    org_level_streams = set(streams_to_sync).intersection(STREAM_TO_SYNC_FOR_ORGS)

    # Nothing selected: neither organizations nor repositories are synced.
    if not selected_stream_ids:
        return

    for org in organizations:
        LOGGER.info("Starting sync of organization: %s", org)
        do_sync(catalog, org_level_streams, selected_stream_ids, client, start_date, state, org)

    # Everything that is not organization-level runs once per repository.
    repo_level_streams = set(streams_to_sync) - org_level_streams
    # pylint: disable=too-many-nested-blocks
    for repo in get_ordered_repos(state, repositories):
        update_currently_syncing_repo(state, repo)
        LOGGER.info("Starting sync of repository: %s", repo)
        do_sync(catalog, repo_level_streams, selected_stream_ids, client, start_date, state, repo, config)

        if not client.not_accessible_repos:
            continue

        # Warn about streams that could not access this repository, then
        # reset the collection before the next repository.
        message = "Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository for following streams {}.".format(repo, ", ".join(client.not_accessible_repos))
        LOGGER.warning(message)
        client.not_accessible_repos = set()

    # All repositories handled: clear the resume marker.
    update_currently_syncing_repo(state, None)
def do_sync(catalog, streams_to_sync, selected_stream_ids, client, start_date, state, repo, config=None):
    """
    Sync every parent stream in `streams_to_sync` for one repository or organization.

    Streams are visited in the order given by `get_ordered_stream_list`,
    starting from the stream recorded as currently syncing in `state`.
    Streams that have a parent are skipped here because they are synced as
    part of their parent stream.

    `repo` is the repository or organization path handed to each stream's
    `sync_endpoint` as `repo_path`. `config` is forwarded to `sync_endpoint`
    as well; an empty dict is used when it is omitted.
    """
    # Create the fallback dict per call. A `{}` default in the signature is
    # one object shared by every call that omits `config`.
    if config is None:
        config = {}

    currently_syncing = singer.get_currently_syncing(state)
    for stream_id in get_ordered_stream_list(currently_syncing, streams_to_sync):
        stream_obj = STREAMS[stream_id]()
        LOGGER.info(f'Starting stream {stream_id} for {repo}.')

        # If it is a "sub_stream", it will be synced as part of the parent stream.
        if stream_id not in streams_to_sync or stream_obj.parent:
            continue

        write_schemas(stream_id, catalog, selected_stream_ids)
        update_currently_syncing(state, stream_id)

        state = stream_obj.sync_endpoint(
            client=client,
            state=state,
            catalog=catalog['streams'],
            repo_path=repo,
            start_date=start_date,
            selected_stream_ids=selected_stream_ids,
            stream_to_sync=streams_to_sync,
            config=config,
        )

        singer.write_state(state)
        # Stream finished: clear the marker so a later run does not resume from it.
        update_currently_syncing(state, None)