|
19 | 19 | DoubleType, |
20 | 20 | DecimalType, |
21 | 21 | BinaryType, |
22 | | - FixedType |
| 22 | + FixedType, |
23 | 23 | ) |
24 | 24 | from pyiceberg.partitioning import PartitionSpec, PartitionField |
25 | 25 | from pyiceberg.transforms import DayTransform |
|
178 | 178 | pandas_df["tm"] = pd.to_datetime(pandas_df["tm"]).dt.time |
179 | 179 | pandas_df["ts"] = pd.to_datetime(pandas_df["ts"]) |
180 | 180 | pandas_df["dt"] = pd.to_datetime(pandas_df["dt"]).dt.date |
181 | | - pandas_df['dec'] = pandas_df["dec"].apply(lambda x: Decimal(f"{x:.3f}")) |
| 181 | + pandas_df["dec"] = pandas_df["dec"].apply(lambda x: Decimal(f"{x:.3f}")) |
182 | 182 | # pandas_df['uuid'] = pandas_df['uuid'].apply(lambda x: bytes(x)) |
183 | | - pandas_df['fixed'] = pandas_df['fixed'].apply(lambda x: bytes(x)) |
184 | | - pandas_df['varbin'] = pandas_df['varbin'].apply(lambda x: bytes(x)) |
| 183 | + pandas_df["fixed"] = pandas_df["fixed"].apply(lambda x: bytes(x)) |
| 184 | + pandas_df["varbin"] = pandas_df["varbin"].apply(lambda x: bytes(x)) |
185 | 185 |
|
186 | 186 | else: |
187 | 187 | # Generate a range of dates between 2024-01-01 and 2024-12-31 |
188 | | - date_range = pd.date_range(start='2024-01-01', end='2024-12-31') |
| 188 | + date_range = pd.date_range(start="2024-01-01", end="2024-12-31") |
189 | 189 |
|
190 | 190 | data = { |
191 | 191 | "b": np.random.choice([True, False], size=num_records), # Boolean |
192 | 192 | "i": np.arange(1, num_records + 1, dtype=np.int32), |
193 | | - "l": np.random.randint(np.iinfo(np.int64).min, np.iinfo(np.int64).max, size=num_records, dtype=np.int64), # int64 |
194 | | - "r": np.random.uniform(-1e6, 1e6, size=num_records).astype(np.float32), # float32 |
195 | | - "d": np.random.uniform(-1e12, 1e12, size=num_records).astype(np.float64), # float64 |
196 | | - "dec": [Decimal(random.uniform(-1e5, 1e5)).quantize(Decimal("0.001")) for _ in range(num_records)], |
197 | | - #"dt": pd.date_range(start="2000-01-01", periods=num_records, freq="D").date, # date32 |
| 193 | + "l": np.random.randint( |
| 194 | + np.iinfo(np.int64).min, |
| 195 | + np.iinfo(np.int64).max, |
| 196 | + size=num_records, |
| 197 | + dtype=np.int64, |
| 198 | + ), # int64 |
| 199 | + "r": np.random.uniform(-1e6, 1e6, size=num_records).astype( |
| 200 | + np.float32 |
| 201 | + ), # float32 |
| 202 | + "d": np.random.uniform(-1e12, 1e12, size=num_records).astype( |
| 203 | + np.float64 |
| 204 | + ), # float64 |
| 205 | + "dec": [ |
| 206 | + Decimal(random.uniform(-1e5, 1e5)).quantize(Decimal("0.001")) |
| 207 | + for _ in range(num_records) |
| 208 | + ], |
| 209 | + # "dt": pd.date_range(start="2000-01-01", periods=num_records, freq="D").date, # date32 |
198 | 210 | "dt": [date_range[i % len(date_range)] for i in range(num_records)], |
199 | | - "tm": [time(random.randint(0, 23), random.randint(0, 59), random.randint(0, 59), random.randint(0, 999999)) for _ in range(num_records)], |
| 211 | + "tm": [ |
| 212 | + time( |
| 213 | + random.randint(0, 23), |
| 214 | + random.randint(0, 59), |
| 215 | + random.randint(0, 59), |
| 216 | + random.randint(0, 999999), |
| 217 | + ) |
| 218 | + for _ in range(num_records) |
| 219 | + ], |
200 | 220 | "ts": [ |
201 | 221 | datetime.datetime(2023, 1, 1) + datetime.timedelta(seconds=i) |
202 | 222 | for i in range(num_records) |
203 | 223 | ], |
204 | 224 | "s": [f"string_{i}" for i in range(num_records)], # string |
205 | 225 | # "uuid": [uuid.uuid4().bytes for _ in range(num_records)], # binary(16) - UUID |
206 | 226 | "fixed": [np.random.bytes(5) for _ in range(num_records)], # fixed binary(5) |
207 | | - "varbin": [np.random.bytes(np.random.randint(1, 20)) for _ in range(num_records)] # variable-length binary |
| 227 | + "varbin": [ |
| 228 | + np.random.bytes(np.random.randint(1, 20)) for _ in range(num_records) |
| 229 | + ], # variable-length binary |
208 | 230 | } |
209 | 231 |
|
210 | 232 | # Create the DataFrame |
211 | 233 | pandas_df = pd.DataFrame(data) |
212 | 234 |
|
213 | | - #print(pandas_df.head()) |
| 235 | + # print(pandas_df.head()) |
214 | 236 |
|
215 | 237 | print("Generating Pandas dataframe") |
216 | 238 |
|
|
0 commit comments