PocketFlow-Tutorial-Codebase-Knowledge/nodes.py at main · tumivn/PocketFlow-Tutorial-Codebase-Knowledge

History

1029 lines (839 loc) · 53.7 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

import os

import yaml

from pocketflow import Node, BatchNode

from utils.crawl_github_files import crawl_github_files

from utils.call_llm import call_llm

from utils.crawl_local_files import crawl_local_files

# Helper to get content for specific file indices
def get_content_for_indices(files_data, indices):
    """Collect the content of the files at the given positions.

    Each element of ``files_data`` is unpacked as a ``(path, content)`` pair.
    The result maps ``"<index> # <path>"`` to that file's content, in the
    order the indices were supplied. Indices outside
    ``0 <= index < len(files_data)`` are skipped silently.
    """
    selected = {}
    for position in indices:
        # Guard clause: ignore anything that does not address a real entry.
        if not (0 <= position < len(files_data)):
            continue
        file_path, file_body = files_data[position]
        # Index + path as key gives downstream prompts both pieces of context.
        selected[f"{position} # {file_path}"] = file_body
    return selected

class FetchRepo(Node):
    """Fetch source files from a GitHub repository or a local directory.

    Reads ``repo_url`` / ``local_dir`` and the crawl settings from ``shared``
    and writes the result to ``shared["files"]`` as a list of
    ``(path, content)`` tuples. Also fills ``shared["project_name"]`` when it
    was not supplied.
    """

    @staticmethod
    def _derive_project_name(repo_url, local_dir):
        """Return a project name taken from the URL's last segment or the directory name.

        Raises:
            ValueError: if neither ``repo_url`` nor ``local_dir`` is set.
        """
        if repo_url:
            # Drop trailing slashes so ".../repo/" still yields "repo".
            name = repo_url.rstrip('/').split('/')[-1]
            # Remove only a trailing ".git"; a blanket replace would mangle
            # names that merely contain it (e.g. "user.github.io").
            if name.endswith('.git'):
                name = name[:-len('.git')]
            return name
        if local_dir:
            return os.path.basename(os.path.abspath(local_dir))
        raise ValueError(
            "Cannot derive project_name: neither 'repo_url' nor 'local_dir' is set"
        )

    def prep(self, shared):
        """Gather crawl parameters from ``shared`` into a plain dict for ``exec``.

        Raises:
            KeyError: if ``include_patterns``, ``exclude_patterns`` or
                ``max_file_size`` is missing from ``shared``.
            ValueError: if no project name is given and none can be derived.
        """
        repo_url = shared.get("repo_url")
        local_dir = shared.get("local_dir")
        project_name = shared.get("project_name")

        if not project_name:
            # Basic name derivation from URL or directory
            project_name = self._derive_project_name(repo_url, local_dir)
            shared["project_name"] = project_name

        return {
            "repo_url": repo_url,
            "local_dir": local_dir,
            "token": shared.get("github_token"),
            # File patterns come directly from shared (required keys).
            "include_patterns": shared["include_patterns"],
            "exclude_patterns": shared["exclude_patterns"],
            "max_file_size": shared["max_file_size"],
            "use_relative_paths": True,
        }

    def exec(self, prep_res):
        """Crawl the repository (preferred when ``repo_url`` is set) or the local directory.

        Returns:
            A list of ``(path, content)`` tuples.

        Raises:
            ValueError: if the crawl produced no files.
        """
        if prep_res["repo_url"]:
            print(f"Crawling repository: {prep_res['repo_url']}...")
            result = crawl_github_files(
                repo_url=prep_res["repo_url"],
                token=prep_res["token"],
                include_patterns=prep_res["include_patterns"],
                exclude_patterns=prep_res["exclude_patterns"],
                max_file_size=prep_res["max_file_size"],
                use_relative_paths=prep_res["use_relative_paths"],
            )
        else:
            print(f"Crawling directory: {prep_res['local_dir']}...")
            result = crawl_local_files(
                directory=prep_res["local_dir"],
                include_patterns=prep_res["include_patterns"],
                exclude_patterns=prep_res["exclude_patterns"],
                max_file_size=prep_res["max_file_size"],
                use_relative_paths=prep_res["use_relative_paths"],
            )

        # Convert dict to list of tuples: [(path, content), ...]
        files_list = list(result.get("files", {}).items())
        if not files_list:
            raise ValueError("Failed to fetch files")
        print(f"Fetched {len(files_list)} files.")
        return files_list

    def post(self, shared, prep_res, exec_res):
        """Store the fetched ``(path, content)`` list under ``shared["files"]``."""
        shared["files"] = exec_res  # List of (path, content) tuples

class IdentifyAbstractions(Node):
    """Ask the LLM for the core abstractions of the codebase and validate its answer.

    Reads ``shared["files"]``, ``shared["project_name"]`` and the optional
    ``shared["language"]``; writes ``shared["abstractions"]`` as a list of
    ``{"name": str, "description": str, "files": [int]}`` dicts.
    """

    def prep(self, shared):
        """Build the LLM context and the index/path listing from ``shared["files"]``.

        Returns:
            ``(context, file_listing_for_prompt, file_count, project_name, language)``.
        """
        files_data = shared["files"]
        project_name = shared["project_name"]
        language = shared.get("language", "english")

        # Full content of every file, each prefixed with its index and path.
        # (No size limiting is applied here.)
        context = "".join(
            f"--- File Index {i}: {path} ---\n{content}\n\n"
            for i, (path, content) in enumerate(files_data)
        )
        # The "# path" part is only a hint for the LLM; the index is what matters.
        file_listing_for_prompt = "\n".join(
            f"- {i} # {path}" for i, (path, _content) in enumerate(files_data)
        )
        return context, file_listing_for_prompt, len(files_data), project_name, language

    def exec(self, prep_res):
        """Prompt the LLM, parse the fenced YAML answer and validate every item.

        Returns:
            A list of ``{"name", "description", "files"}`` dicts where
            ``files`` is a sorted list of unique, in-range file indices.

        Raises:
            ValueError: if the response has no yaml fence, is not a list, an
                item misses keys or has wrongly typed fields, or a file index
                cannot be parsed or is out of range.
        """
        context, file_listing_for_prompt, file_count, project_name, language = prep_res
        print("Identifying abstractions using LLM...")

        # Add language instruction and hints only if not English
        language_instruction = ""
        name_lang_hint = ""
        desc_lang_hint = ""
        if language.lower() != "english":
            language_instruction = f"IMPORTANT: Generate the `name` and `description` for each abstraction in **{language.capitalize()}** language. Do NOT use English for these fields.\n\n"
            # Keep specific hints here as name/description are primary targets
            name_lang_hint = f" (value in {language.capitalize()})"
            desc_lang_hint = f" (value in {language.capitalize()})"

        prompt = f"""
For the project `{project_name}`:

Codebase Context:
{context}

{language_instruction}Analyze the codebase context.
Identify the top 5 to 15 core most important abstractions to help those new to the codebase.

For each abstraction, provide:
1. A concise `name`{name_lang_hint}.
2. A beginner-friendly `description` explaining what it is with a simple analogy, in around 100 words{desc_lang_hint}.
3. A list of relevant `file_indices` (integers) using the format `idx # path/comment`.

List of file indices and paths present in the context:
{file_listing_for_prompt}

Format the output as a YAML list of dictionaries:

```yaml
- name: |
    Query Processing{name_lang_hint}
  description: |
    Explains what the abstraction does.
    It's like a central dispatcher routing requests.{desc_lang_hint}
  file_indices:
    - 0 # path/to/file1.py
    - 3 # path/to/related.py
- name: |
    Query Optimization{name_lang_hint}
  description: |
    Another core concept, similar to a blueprint for objects.{desc_lang_hint}
  file_indices:
    - 5 # path/to/another.js
# ... up to 10 abstractions
```"""
        response = call_llm(prompt)

        # --- Validation ---
        # Fail with a descriptive error instead of a bare IndexError when the
        # model did not wrap its answer in a yaml fence.
        if "```yaml" not in response:
            raise ValueError("LLM response does not contain a ```yaml fenced block")
        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
        abstractions = yaml.safe_load(yaml_str)

        if not isinstance(abstractions, list):
            raise ValueError("LLM Output is not a list")

        validated_abstractions = []
        for item in abstractions:
            if not isinstance(item, dict) or not all(k in item for k in ("name", "description", "file_indices")):
                raise ValueError(f"Missing keys in abstraction item: {item}")
            if not isinstance(item["name"], str):
                raise ValueError(f"Name is not a string in item: {item}")
            if not isinstance(item["description"], str):
                raise ValueError(f"Description is not a string in item: {item}")
            if not isinstance(item["file_indices"], list):
                raise ValueError(f"file_indices is not a list in item: {item}")

            # Validate indices
            validated_indices = []
            for idx_entry in item["file_indices"]:
                try:
                    if isinstance(idx_entry, int):
                        idx = idx_entry
                    elif isinstance(idx_entry, str) and '#' in idx_entry:
                        # "3 # path/to/file" -> 3
                        idx = int(idx_entry.split('#')[0].strip())
                    else:
                        idx = int(str(idx_entry).strip())
                except (ValueError, TypeError) as err:
                    raise ValueError(f"Could not parse index from entry: {idx_entry} in item {item['name']}") from err

                # The range check lives outside the try block so its specific
                # message is not replaced by the generic parse error.
                if not (0 <= idx < file_count):
                    raise ValueError(f"Invalid file index {idx} found in item {item['name']}. Max index is {file_count - 1}.")
                validated_indices.append(idx)

            # Store only the required fields
            validated_abstractions.append({
                "name": item["name"],  # Potentially translated name
                "description": item["description"],  # Potentially translated description
                "files": sorted(set(validated_indices)),
            })

        print(f"Identified {len(validated_abstractions)} abstractions.")
        return validated_abstractions

    def post(self, shared, prep_res, exec_res):
        """Store the validated abstractions under ``shared["abstractions"]``."""
        shared["abstractions"] = exec_res  # List of {"name": str, "description": str, "files": [int]}

class AnalyzeRelationships(Node):
    """Ask the LLM for a project summary and the relationships between abstractions.

    Reads ``shared["abstractions"]``, ``shared["files"]``,
    ``shared["project_name"]`` and the optional ``shared["language"]``; writes
    ``shared["relationships"]`` as
    ``{"summary": str, "details": [{"from": int, "to": int, "label": str}]}``.
    """

    def prep(self, shared):
        """Print the abstractions and assemble the prompt context.

        Returns:
            ``(context, abstraction_listing, project_name, language,
            num_abstractions)``. The count is passed explicitly because the
            listing cannot be counted reliably by splitting on newlines:
            names may themselves contain newlines.
        """
        abstractions = shared["abstractions"]  # Contains 'files' list of indices; name/description potentially translated

        print("\nAbstractions Found:")
        for idx, abstraction in enumerate(abstractions, 1):
            print(f"\n{idx}. {abstraction['name']}")
            print("-" * (len(str(idx)) + 2 + len(abstraction['name'])))
            print(f"Description: {abstraction['description']}")
            print(f"Related Files: {abstraction['files']}")

        files_data = shared["files"]
        project_name = shared["project_name"]
        language = shared.get("language", "english")

        # Create context with abstraction names, indices, descriptions, and relevant file snippets
        print("\nIdentified Abstractions:")
        context = "Identified Abstractions:\n"
        all_relevant_indices = set()
        abstraction_info_for_prompt = []
        for i, abstr in enumerate(abstractions):
            print(f"{i}: {abstr['name']}")
            print(f" Description: {abstr['description'][:100]}...")
            # 'files' contains indices directly
            file_indices_str = ", ".join(map(str, abstr['files']))
            # Abstraction name and description might be translated already
            info_line = f"- Index {i}: {abstr['name']} (Relevant file indices: [{file_indices_str}])\n Description: {abstr['description']}"
            context += info_line + "\n"
            abstraction_info_for_prompt.append(f"{i} # {abstr['name']}")
            all_relevant_indices.update(abstr['files'])

        context += "\nRelevant File Snippets (Referenced by Index and Path):\n"
        # Get content for relevant files using helper
        relevant_files_content_map = get_content_for_indices(
            files_data,
            sorted(all_relevant_indices),
        )
        # Format file content for context
        context += "\n\n".join(
            f"--- File: {idx_path} ---\n{content}"
            for idx_path, content in relevant_files_content_map.items()
        )

        print("\n")  # Add blank line after abstractions list
        return (
            context,
            "\n".join(abstraction_info_for_prompt),
            project_name,
            language,
            len(abstractions),
        )

    def exec(self, prep_res):
        """Prompt the LLM, parse the fenced YAML answer and validate it.

        Returns:
            ``{"summary": str, "details": [{"from": int, "to": int, "label": str}]}``.

        Raises:
            ValueError: if the response has no yaml fence, lacks ``summary`` /
                ``relationships``, has wrongly typed fields, or a relationship
                index cannot be parsed or is out of range.
        """
        context, abstraction_listing, project_name, language, num_abstractions = prep_res
        print("Analyzing relationships using LLM...")

        # Add language instruction and hints only if not English
        language_instruction = ""
        lang_hint = ""
        list_lang_note = ""
        if language.lower() != "english":
            language_instruction = f"IMPORTANT: Generate the `summary` and relationship `label` fields in **{language.capitalize()}** language. Do NOT use English for these fields.\n\n"
            lang_hint = f" (in {language.capitalize()})"
            list_lang_note = f" (Names might be in {language.capitalize()})"  # Note for the input list

        prompt = f"""
Based on the following abstractions and relevant code snippets from the project `{project_name}`:

List of Abstraction Indices and Names{list_lang_note}:
{abstraction_listing}

Context (Abstractions, Descriptions, Code):
{context}

{language_instruction}Please provide:
1. A high-level `summary` of the project's main purpose and functionality in a few beginner-friendly sentences{lang_hint}. Use markdown formatting with **bold** and *italic* text to highlight important concepts.
2. A list (`relationships`) describing the key interactions between these abstractions. For each relationship, specify:
    - `from_abstraction`: Index of the source abstraction (e.g., `0 # AbstractionName1`)
    - `to_abstraction`: Index of the target abstraction (e.g., `1 # AbstractionName2`)
    - `label`: A brief label for the interaction **in just a few words**{lang_hint} (e.g., "Manages", "Inherits", "Uses").
    Ideally the relationship should be backed by one abstraction calling or passing parameters to another.
    Simplify the relationship and exclude those non-important ones.

IMPORTANT: Make sure EVERY abstraction is involved in at least ONE relationship (either as source or target). Each abstraction index must appear at least once across all relationships.

Format the output as YAML:

```yaml
summary: |
  A brief, simple explanation of the project{lang_hint}.
  Can span multiple lines with **bold** and *italic* for emphasis.
relationships:
  - from_abstraction: 0 # AbstractionName1
    to_abstraction: 1 # AbstractionName2
    label: "Manages"{lang_hint}
  - from_abstraction: 2 # AbstractionName3
    to_abstraction: 0 # AbstractionName1
    label: "Provides config"{lang_hint}
  # ... other relationships
```

Now, provide the YAML output:
"""
        response = call_llm(prompt)

        # --- Validation ---
        # Fail with a descriptive error instead of a bare IndexError when the
        # model did not wrap its answer in a yaml fence.
        if "```yaml" not in response:
            raise ValueError("LLM response does not contain a ```yaml fenced block")
        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
        relationships_data = yaml.safe_load(yaml_str)

        if not isinstance(relationships_data, dict) or not all(k in relationships_data for k in ("summary", "relationships")):
            raise ValueError("LLM output is not a dict or missing keys ('summary', 'relationships')")
        if not isinstance(relationships_data["summary"], str):
            raise ValueError("summary is not a string")
        if not isinstance(relationships_data["relationships"], list):
            raise ValueError("relationships is not a list")

        # Validate relationships structure
        validated_relationships = []
        for rel in relationships_data["relationships"]:
            if not isinstance(rel, dict) or not all(k in rel for k in ("from_abstraction", "to_abstraction", "label")):
                raise ValueError(f"Missing keys (expected from_abstraction, to_abstraction, label) in relationship item: {rel}")
            if not isinstance(rel["label"], str):
                raise ValueError(f"Relationship label is not a string: {rel}")

            # Entries look like "0 # AbstractionName"; only the leading integer matters.
            try:
                from_idx = int(str(rel["from_abstraction"]).split('#')[0].strip())
                to_idx = int(str(rel["to_abstraction"]).split('#')[0].strip())
            except (ValueError, TypeError) as err:
                raise ValueError(f"Could not parse indices from relationship: {rel}") from err

            # The range check lives outside the try block so its specific
            # message is not replaced by the generic parse error.
            if not (0 <= from_idx < num_abstractions and 0 <= to_idx < num_abstractions):
                raise ValueError(f"Invalid index in relationship: from={from_idx}, to={to_idx}. Max index is {num_abstractions-1}.")

            validated_relationships.append({
                "from": from_idx,
                "to": to_idx,
                "label": rel["label"],  # Potentially translated label
            })

        print("Generated project summary and relationship details.")
        return {
            "summary": relationships_data["summary"],  # Potentially translated summary
            "details": validated_relationships,  # Validated, index-based relationships
        }

    def post(self, shared, prep_res, exec_res):
        """Store the summary and relationships under ``shared["relationships"]``."""
        # Structure is {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
        # Summary and label might be translated
        shared["relationships"] = exec_res

class OrderChapters(Node):
    """Ask the LLM for the order in which the abstractions should be presented.

    Reads ``shared["abstractions"]``, ``shared["relationships"]``,
    ``shared["project_name"]`` and the optional ``shared["language"]``; writes
    ``shared["chapter_order"]`` as a list of abstraction indices.
    """

    def prep(self, shared):
        """Assemble the abstraction listing and relationship context for the prompt.

        Returns:
            ``(abstraction_listing, context, num_abstractions, project_name,
            list_lang_note)``.
        """
        abstractions = shared["abstractions"]  # Name/description might be translated
        relationships = shared["relationships"]  # Summary/label might be translated
        project_name = shared["project_name"]
        language = shared.get("language", "english")
        is_translated = language.lower() != "english"

        # Prepare context for the LLM
        abstraction_listing = "\n".join(
            f"- {i} # {a['name']}" for i, a in enumerate(abstractions)
        )

        summary_note = ""
        if is_translated:
            summary_note = f" (Note: Project Summary might be in {language.capitalize()})"

        context = f"Project Summary{summary_note}:\n{relationships['summary']}\n\n"
        context += "Relationships (Indices refer to abstractions above):\n"
        for rel in relationships['details']:
            from_name = abstractions[rel['from']]['name']
            to_name = abstractions[rel['to']]['name']
            # Label might be translated
            context += f"- From {rel['from']} ({from_name}) to {rel['to']} ({to_name}): {rel['label']}\n"

        list_lang_note = ""
        if is_translated:
            list_lang_note = f" (Names might be in {language.capitalize()})"

        return abstraction_listing, context, len(abstractions), project_name, list_lang_note

    def exec(self, prep_res):
        """Prompt the LLM, parse the fenced YAML answer and validate the ordering.

        Returns:
            A list containing every abstraction index exactly once.

        Raises:
            ValueError: if the response has no yaml fence, is not a list, an
                entry cannot be parsed, an index is out of range or repeated,
                or some abstraction is missing from the list.
        """
        abstraction_listing, context, num_abstractions, project_name, list_lang_note = prep_res
        print("Determining chapter order using LLM...")

        # No language variation needed here in prompt instructions, just ordering based on structure
        # The input names might be translated, hence the note.
        prompt = f"""
Given the following project abstractions and their relationships for the project ```` {project_name} ````:

Abstractions (Index # Name){list_lang_note}:
{abstraction_listing}

Context about relationships and project summary:
{context}

If you are going to make a tutorial for ```` {project_name} ````, what is the best order to explain these abstractions, from first to last?
Ideally, first explain those that are the most important or foundational, perhaps user-facing concepts or entry points. Then move to more detailed, lower-level implementation details or supporting concepts.

Output the ordered list of abstraction indices, including the name in a comment for clarity. Use the format `idx # AbstractionName`.

```yaml
- 2 # FoundationalConcept
- 0 # CoreClassA
- 1 # CoreClassB (uses CoreClassA)
- ...
```

Now, provide the YAML output:
"""
        response = call_llm(prompt)

        # --- Validation ---
        # Fail with a descriptive error instead of a bare IndexError when the
        # model did not wrap its answer in a yaml fence.
        if "```yaml" not in response:
            raise ValueError("LLM response does not contain a ```yaml fenced block")
        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
        ordered_indices_raw = yaml.safe_load(yaml_str)

        if not isinstance(ordered_indices_raw, list):
            raise ValueError("LLM output is not a list")

        ordered_indices = []
        seen_indices = set()
        for entry in ordered_indices_raw:
            try:
                if isinstance(entry, int):
                    idx = entry
                elif isinstance(entry, str) and '#' in entry:
                    # "2 # Name" -> 2
                    idx = int(entry.split('#')[0].strip())
                else:
                    idx = int(str(entry).strip())
            except (ValueError, TypeError) as err:
                raise ValueError(f"Could not parse index from ordered list entry: {entry}") from err

            # Range and duplicate checks live outside the try block so their
            # specific messages are not replaced by the generic parse error.
            if not (0 <= idx < num_abstractions):
                raise ValueError(f"Invalid index {idx} in ordered list. Max index is {num_abstractions-1}.")
            if idx in seen_indices:
                raise ValueError(f"Duplicate index {idx} found in ordered list.")
            ordered_indices.append(idx)
            seen_indices.add(idx)

        # Check if all abstractions are included
        if len(ordered_indices) != num_abstractions:
            raise ValueError(f"Ordered list length ({len(ordered_indices)}) does not match number of abstractions ({num_abstractions}). Missing indices: {set(range(num_abstractions)) - seen_indices}")

        print(f"Determined chapter order (indices): {ordered_indices}")
        return ordered_indices  # Return the list of indices

    def post(self, shared, prep_res, exec_res):
        """Store the ordered abstraction indices under ``shared["chapter_order"]``."""
        shared["chapter_order"] = exec_res  # List of indices

class WriteChapters(BatchNode):

def prep(self, shared):
    """Build one work item per chapter for the batch run.

    Reads ``chapter_order``, ``abstractions``, ``files``, ``project_name``
    and the optional ``language`` / ``doc_type`` from ``shared``. When
    ``doc_type`` is ``"sad"`` an extra "Architecture Overview" (C4) item is
    inserted at the front and all items are renumbered from 1.

    Side effects: resets ``self.chapters_written_so_far`` and sets
    ``self.insert_c4_chapter``.

    Returns:
        A list of dicts, one per chapter to write.
    """
    chapter_order = shared["chapter_order"]  # List of indices
    abstractions = shared["abstractions"]  # List of dicts, name/desc potentially translated
    files_data = shared["files"]
    language = shared.get("language", "english")
    doc_type = shared.get("doc_type", "tutorial")

    # Temporary storage across exec calls; not kept in shared memory.
    self.chapters_written_so_far = []

    # Flag to track if we need to insert C4 model chapter
    self.insert_c4_chapter = (doc_type == "sad")

    # Create a complete list of all chapters
    all_chapters = []
    chapter_filenames = {}  # abstraction index -> {"num", "name", "filename"} for linking
    for i, abstraction_index in enumerate(chapter_order):
        if 0 <= abstraction_index < len(abstractions):
            chapter_num = i + 1
            chapter_name = abstractions[abstraction_index]["name"]  # Potentially translated name
            # Create safe filename (from potentially translated name)
            safe_name = "".join(c if c.isalnum() else '_' for c in chapter_name).lower()
            filename = f"{i+1:02d}_{safe_name}.md"
            # Format with link (using potentially translated name)
            all_chapters.append(f"{chapter_num}. [{chapter_name}]({filename})")
            chapter_filenames[abstraction_index] = {"num": chapter_num, "name": chapter_name, "filename": filename}

    # Create a formatted string with all chapters
    full_chapter_listing = "\n".join(all_chapters)

    items_to_process = []
    for i, abstraction_index in enumerate(chapter_order):
        if 0 <= abstraction_index < len(abstractions):
            abstraction_details = abstractions[abstraction_index]  # Contains potentially translated name/desc
            # Use 'files' (list of indices) directly
            related_file_indices = abstraction_details.get("files", [])
            related_files_content_map = get_content_for_indices(files_data, related_file_indices)

            # Neighbouring chapters for transitions. .get() is used because a
            # neighbour whose index was invalid has no entry in
            # chapter_filenames; it is then treated as absent instead of
            # raising KeyError.
            prev_chapter = None
            if i > 0:
                prev_chapter = chapter_filenames.get(chapter_order[i-1])

            next_chapter = None
            if i < len(chapter_order) - 1:
                next_chapter = chapter_filenames.get(chapter_order[i+1])

            items_to_process.append({
                "chapter_num": i + 1,
                "abstraction_index": abstraction_index,
                "abstraction_details": abstraction_details,  # Has potentially translated name/desc
                "related_files_content_map": related_files_content_map,
                "project_name": shared["project_name"],
                "full_chapter_listing": full_chapter_listing,  # Uses potentially translated names
                "chapter_filenames": chapter_filenames,  # Uses potentially translated names
                "prev_chapter": prev_chapter,
                "next_chapter": next_chapter,
                "language": language,  # For multi-language support
                "doc_type": doc_type,  # For supporting different document types
                # previous_chapters_summary will be added dynamically in exec
            })
        else:
            print(f"Warning: Invalid abstraction index {abstraction_index} in chapter_order. Skipping.")

    if self.insert_c4_chapter:
        c4_chapter = {
            "chapter_num": 1,
            "is_c4_chapter": True,  # Special flag to identify this as the C4 model chapter
            "project_name": shared["project_name"],
            "full_chapter_listing": full_chapter_listing,
            "chapter_filenames": chapter_filenames,
            "language": language,
            "doc_type": doc_type,
            "abstractions": abstractions,
            "relationships": shared["relationships"],
            # Content of at most the first 20 files, for context
            "all_files_content_map": get_content_for_indices(files_data, range(min(20, len(files_data)))),
            # Add a special abstraction_details for the C4 model chapter
            "abstraction_details": {
                "name": "Architecture Overview",
                "description": "A high-level architecture overview following the C4 model, showing the system context, containers, and components."
            },
        }
        items_to_process.insert(0, c4_chapter)

        # Update chapter numbers for all items to reflect the new sequence.
        # NOTE(review): full_chapter_listing and chapter_filenames were built
        # before this renumbering and still carry the pre-insert numbers;
        # confirm whether downstream code expects them to match.
        for index, item in enumerate(items_to_process, start=1):
            item["chapter_num"] = index

    print(f"Preparing to write {len(items_to_process)} chapters...")
    return items_to_process  # Iterable for BatchNode

def exec(self, item):
    """Generate the Markdown text of a single chapter by prompting the LLM.

    Args:
        item: One chapter dict from the prepared batch. Always read:
            "chapter_num", "abstraction_details" ("name"/"description") and
            "full_chapter_listing". Optional: "project_name", "language"
            (default "english"), "doc_type" (default "tutorial") and
            "is_c4_chapter" (default False). C4 items must also carry
            "all_files_content_map", "abstractions" and "relationships";
            all other items must carry "related_files_content_map".

    Returns:
        The chapter Markdown string. If the LLM output does not begin with
        "# Chapter <chapter_num>", its first heading line is replaced by (or
        the text is prefixed with) "# Chapter <chapter_num>: <name>".

    Side effects:
        Prints a progress line and appends the chapter to
        self.chapters_written_so_far, which is then used as context for the
        following chapters.
    """
    # This runs for each item prepared above
    chapter_num = item["chapter_num"]
    project_name = item.get("project_name")
    language = item.get("language", "english")
    doc_type = item.get("doc_type", "tutorial")

    # Special handling for C4 model chapter
    if item.get("is_c4_chapter", False):
        # Extract name and description from abstraction_details for C4 model chapter
        abstraction_name = item["abstraction_details"]["name"] # Get name from abstraction_details
        # NOTE(review): abstraction_description is not referenced by the C4 prompt below.
        abstraction_description = item["abstraction_details"]["description"] # Get description from abstraction_details
        print(f"Writing {doc_type} chapter {chapter_num}: {abstraction_name} using LLM...")

        # Prepare file context string from all files
        # Keys containing "# " are shown as the part after the first "# ";
        # presumably keys look like "<index> # <path>" -- confirm against get_content_for_indices.
        file_context_str = "\n\n".join(
            f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}"
            for idx_path, content in item["all_files_content_map"].items()
        )

        # Get abstractions and relationships for context
        abstractions = item["abstractions"]
        # NOTE(review): relationships is read here but never used afterwards in this method.
        relationships = item["relationships"]

        # Create abstraction summary for context
        # One bullet per abstraction: index, name, and the first 100 characters of its description.
        abstraction_summary = "\n".join([
            f"- {i}: {abstr['name']} - {abstr['description'][:100]}..."
            for i, abstr in enumerate(abstractions)
        ])
    else:
        # Regular chapter processing
        abstraction_name = item["abstraction_details"]["name"] # Potentially translated name
        abstraction_description = item["abstraction_details"]["description"] # Potentially translated description
        print(f"Writing {doc_type} chapter {chapter_num} for: {abstraction_name} using LLM...")

        # Prepare file context string from the map
        # Same key handling as the C4 branch: show only the part after "# " when present.
        file_context_str = "\n\n".join(
            f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}"
            for idx_path, content in item["related_files_content_map"].items()
        )

    # Get summary of chapters written *before* this one
    # Use the temporary instance variable
    # The full text of every earlier chapter is joined, separated by "---" lines.
    previous_chapters_summary = "\n---\n".join(self.chapters_written_so_far)

    # Add language instruction and context notes only if not English
    # All notes default to "" so the prompts read naturally for English output.
    language_instruction = ""
    concept_details_note = ""
    structure_note = ""
    prev_summary_note = ""
    instruction_lang_note = ""
    mermaid_lang_note = ""
    code_comment_note = ""
    link_lang_note = ""
    tone_note = ""

    if language.lower() != "english":
        lang_cap = language.capitalize()
        language_instruction = f"IMPORTANT: Write this ENTIRE {doc_type} chapter in **{lang_cap}**. Some input context (like concept name, description, chapter list, previous summary) might already be in {lang_cap}, but you MUST translate ALL other generated content including explanations, examples, technical terms, and potentially code comments into {lang_cap}. DO NOT use English anywhere except in code syntax, required proper nouns, or when specified. The entire output MUST be in {lang_cap}.\n\n"
        concept_details_note = f" (Note: Provided in {lang_cap})"
        structure_note = f" (Note: Chapter names might be in {lang_cap})"
        prev_summary_note = f" (Note: This summary might be in {lang_cap})"
        instruction_lang_note = f" (in {lang_cap})"
        mermaid_lang_note = f" (Use {lang_cap} for labels/text if appropriate)"
        code_comment_note = f" (Translate to {lang_cap} if possible, otherwise keep minimal English for clarity)"
        link_lang_note = f" (Use the {lang_cap} chapter title from the structure above)"
        tone_note = f" (appropriate for {lang_cap} readers)"

    # Choose the appropriate prompt based on the document type and whether it's the C4 model chapter
    # The C4 flag takes precedence over doc_type; any doc_type other than "sad" gets the tutorial prompt.
    if item.get("is_c4_chapter", False):
        # Special C4 model chapter prompt
        # This prompt hard-codes the heading "# Chapter 1: Architecture Overview".
        prompt = f"""

{language_instruction}Write a professional Software Architecture Document (SAD) chapter (in Markdown format) for the project `{project_name}` that provides a high-level architecture overview following the C4 model. This is Chapter {chapter_num}.

Complete Document Structure{structure_note}:

{item["full_chapter_listing"]}

Context from previous chapters{prev_summary_note}:

{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."}

Project Abstractions Summary:

{abstraction_summary}

Relevant Code Snippets (Code itself remains unchanged):

{file_context_str if file_context_str else "No specific code snippets provided."}

Instructions for the chapter (Generate content in {language.capitalize()} unless specified otherwise):

- Start with a clear heading: `# Chapter 1: Architecture Overview`.

- If this is not the first chapter, begin with a brief transition from the previous chapter{instruction_lang_note}, referencing it with a proper Markdown link using its name{link_lang_note}.

- Begin with an "Introduction" section that provides an overview of the purpose and scope of the architecture document{instruction_lang_note}.

- Include a section on "Architectural Representation" that explains the C4 model and how it's used to represent the architecture{instruction_lang_note}. Explain that the C4 model provides a way to visualize the architecture at different levels of abstraction: Context, Containers, Components, and Code.

- Include a section on "Architectural Goals and Constraints" that describes:

* The key goals that the architecture aims to achieve

* Business, technical, and operational constraints that impact the architecture

* Quality attributes that are prioritized in the design

- Include a section on "Use-Case View" that:

* Identifies the key use cases or user stories that drive the architecture

* Describes how the architecture supports these use cases

* Includes a diagram or list of the most architecturally significant use cases

- Include a section on "System Context (Logical View - Level 1)" that:

* Shows the system as a whole and its interactions with users and external systems

* Provides a high-level view of the system boundaries

* Includes a context diagram using mermaid (```mermaid``` format) {mermaid_lang_note}

- Include a section on "Containers (Logical View - Level 2)" that:

* Breaks down the system into containers (applications, data stores, microservices)

* Shows how containers communicate with each other

* Includes a container diagram using mermaid (```mermaid``` format) {mermaid_lang_note}

- Include a section on "Components (Logical View - Level 3)" that:

* Breaks down the main containers into components

* Shows the major structural building blocks and their interactions

* Includes a component diagram using mermaid (```mermaid``` format) {mermaid_lang_note}

* IMPORTANT: When referring to components covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Document Structure above to find the correct filename and the chapter title{link_lang_note}.

- Include a section on "Process View" that:

* Describes the system's dynamic behavior

* Explains key processes, workflows, or runtime scenarios

* Includes sequence diagrams for important processes using mermaid (```mermaid``` format) {mermaid_lang_note}

- Include a section on "Deployment View" that:

* Describes the physical deployment of the system

* Shows how software components map to hardware infrastructure

* Includes a deployment diagram using mermaid (```mermaid``` format) {mermaid_lang_note}

- Include a section on "Size and Performance" that:

* Describes the expected or measured performance characteristics

* Explains how the architecture addresses scalability requirements

* Identifies any performance constraints or bottlenecks

- Include a section on "Quality" that describes:

* The architectural style(s) used (e.g., microservices, layered, event-driven)

* How the architecture addresses key quality attributes (security, reliability, maintainability, etc.)

* Cross-cutting concerns (logging, error handling, etc.)

- Include a section on "Technology Stack" that provides an overview of the technologies used in the system.

- End with a section on "Architecture Decision Records" that summarizes key architectural decisions, their rationale, and alternatives considered.

- Maintain a formal, professional tone throughout the document{tone_note}.

- Output *only* the Markdown content for this chapter.

Now, directly provide a professional Architecture Overview chapter in Markdown format (DON'T need ```markdown``` tags):

"""
    elif doc_type == "sad":
        # Per-component chapter of a Software Architecture Document.
        prompt = f"""

{language_instruction}Write a professional Software Architecture Document (SAD) chapter (in Markdown format) for the project `{project_name}` about the component: "{abstraction_name}". This is Chapter {chapter_num}.

Component Details{concept_details_note}:

- Name: {abstraction_name}

- Description:

{abstraction_description}

Complete Document Structure{structure_note}:

{item["full_chapter_listing"]}

Context from previous chapters{prev_summary_note}:

{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."}

Relevant Code Snippets (Code itself remains unchanged):

{file_context_str if file_context_str else "No specific code snippets provided for this component."}

Instructions for the chapter (Generate content in {language.capitalize()} unless specified otherwise):

- Start with a clear heading (e.g., `# Chapter {chapter_num}: {abstraction_name}`). Use the provided component name.

- If this is not the first chapter, begin with a brief transition from the previous chapter{instruction_lang_note}, referencing it with a proper Markdown link using its name{link_lang_note}.

- Begin with a formal component overview that describes the purpose, responsibilities, and architectural significance of this component{instruction_lang_note}.

- Include a section on "Architectural Design" that covers:

* Component structure and organization

* Design patterns used

* Key interfaces and their purposes

* Dependencies on other components

* Quality attributes addressed (performance, security, scalability, etc.)

- Include a section on "Implementation Details" that provides:

* Key classes/modules and their responsibilities

* Critical algorithms or processes

* Data structures used

* Error handling and fault tolerance mechanisms

- Use UML or similar diagrams to illustrate the component's structure and interactions. Use mermaid diagrams (```mermaid``` format) for class diagrams, sequence diagrams, etc. {mermaid_lang_note}.

- Include code examples that highlight important architectural aspects. Focus on interfaces, patterns, and structure rather than implementation details.

- IMPORTANT: When you need to refer to other components covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Document Structure above to find the correct filename and the chapter title{link_lang_note}.

- Include a section on "Design Decisions and Tradeoffs" that explains:

* Why this architectural approach was chosen

* Alternatives that were considered

* Tradeoffs made and their justifications

- End with a section on "Integration Points" that describes how this component interacts with other parts of the system{instruction_lang_note}. If there is a next chapter, use a proper Markdown link: [Next Chapter Title](next_chapter_filename){link_lang_note}.

- Maintain a formal, professional tone throughout the document{tone_note}.

- Output *only* the Markdown content for this chapter.

Now, directly provide a professional Software Architecture Document chapter in Markdown format (DON'T need ```markdown``` tags):

"""
    else: # Default to tutorial
        prompt = f"""

{language_instruction}Write a very beginner-friendly tutorial chapter (in Markdown format) for the project `{project_name}` about the concept: "{abstraction_name}". This is Chapter {chapter_num}.

Concept Details{concept_details_note}:

- Name: {abstraction_name}

- Description:

{abstraction_description}

Complete Tutorial Structure{structure_note}:

{item["full_chapter_listing"]}

Context from previous chapters{prev_summary_note}:

{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."}

Relevant Code Snippets (Code itself remains unchanged):

{file_context_str if file_context_str else "No specific code snippets provided for this abstraction."}

Instructions for the chapter (Generate content in {language.capitalize()} unless specified otherwise):

- Start with a clear heading (e.g., `# Chapter {chapter_num}: {abstraction_name}`). Use the provided concept name.

- If this is not the first chapter, begin with a brief transition from the previous chapter{instruction_lang_note}, referencing it with a proper Markdown link using its name{link_lang_note}.

- Begin with a high-level motivation explaining what problem this abstraction solves{instruction_lang_note}. Start with a central use case as a concrete example. The whole chapter should guide the reader to understand how to solve this use case. Make it very minimal and friendly to beginners.

- If the abstraction is complex, break it down into key concepts. Explain each concept one-by-one in a very beginner-friendly way{instruction_lang_note}.

- Explain how to use this abstraction to solve the use case{instruction_lang_note}. Give example inputs and outputs for code snippets (if the output isn't values, describe at a high level what will happen{instruction_lang_note}).

- Each code block should be BELOW 20 lines! If longer code blocks are needed, break them down into smaller pieces and walk through them one-by-one. Aggresively simplify the code to make it minimal. Use comments{code_comment_note} to skip non-important implementation details. Each code block should have a beginner friendly explanation right after it{instruction_lang_note}.

- Describe the internal implementation to help understand what's under the hood{instruction_lang_note}. First provide a non-code or code-light walkthrough on what happens step-by-step when the abstraction is called{instruction_lang_note}. It's recommended to use a simple sequenceDiagram with a dummy example - keep it minimal with at most 5 participants to ensure clarity. If participant name has space, use: `participant QP as Query Processing`. {mermaid_lang_note}.

- Then dive deeper into code for the internal implementation with references to files. Provide example code blocks, but make them similarly simple and beginner-friendly. Explain{instruction_lang_note}.

- IMPORTANT: When you need to refer to other core abstractions covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Tutorial Structure above to find the correct filename and the chapter title{link_lang_note}. Translate the surrounding text.

- Use mermaid diagrams to illustrate complex concepts (```mermaid``` format). {mermaid_lang_note}.

- Heavily use analogies and examples throughout{instruction_lang_note} to help beginners understand.

- End the chapter with a brief conclusion that summarizes what was learned{instruction_lang_note} and provides a transition to the next chapter{instruction_lang_note}. If there is a next chapter, use a proper Markdown link: [Next Chapter Title](next_chapter_filename){link_lang_note}.

- Ensure the tone is welcoming and easy for a newcomer to understand{tone_note}.

- Output *only* the Markdown content for this chapter.

Now, directly provide a super beginner-friendly Markdown output (DON'T need ```markdown``` tags):

"""

    chapter_content = call_llm(prompt)

    # Basic validation/cleanup
    actual_heading = f"# Chapter {chapter_num}: {abstraction_name}" # Use potentially translated name
    # NOTE(review): this is a plain prefix test, so for chapter_num 1 a heading
    # such as "# Chapter 10: ..." is also accepted as correct.
    if not chapter_content.strip().startswith(f"# Chapter {chapter_num}"):
        # Add heading if missing or incorrect, trying to preserve content
        lines = chapter_content.strip().split('\n')
        if lines and lines[0].strip().startswith("#"): # If there's some heading, replace it
            lines[0] = actual_heading
            chapter_content = "\n".join(lines)
        else: # Otherwise, prepend it
            chapter_content = f"{actual_heading}\n\n{chapter_content}"

    # Add the generated content to our temporary list for the next iteration's context
    self.chapters_written_so_far.append(chapter_content)
    return chapter_content # Return the Markdown string (potentially translated)

def post(self, shared, prep_res, exec_res_list):
    """Publish the generated chapters and discard the per-run scratch list.

    Args:
        shared: Flow-wide state; receives the results under "chapters".
        prep_res: Items produced by prep (not used here).
        exec_res_list: Generated Markdown, one entry per chapter, in order.
    """
    # Hand the ordered chapter texts to the downstream nodes.
    shared["chapters"] = exec_res_list

    # The running list of written chapters only matters while exec is
    # iterating, so remove it from the instance once the batch is done.
    delattr(self, "chapters_written_so_far")

    written = len(exec_res_list)
    print(f"Finished writing {written} chapters.")

class CombineTutorial(Node):

def prep(self, shared):
    """Build the index page and the per-chapter file contents to be written.

    Args:
        shared: Flow-wide state dict. Required keys: "project_name",
            "relationships" (dict with "summary" and "details", where each
            detail has "from", "to" and "label"), "chapter_order" (list of
            abstraction indices), "abstractions" (dicts with at least
            "name") and "chapters" (list of Markdown strings). Optional
            keys: "output_dir" (default "output"), "repo_url" and
            "doc_type" (default "tutorial").

    Returns:
        A dict with "output_path" (output_dir joined with the project
        name), "index_content" (text of the index page) and "chapter_files"
        (list of {"filename": str, "content": str} dicts).
    """
    project_name = shared["project_name"]
    output_base_dir = shared.get("output_dir", "output")  # Default output dir
    output_path = os.path.join(output_base_dir, project_name)
    repo_url = shared.get("repo_url")
    doc_type = shared.get("doc_type", "tutorial")
    is_sad = doc_type == "sad"

    # Names, descriptions, labels, summary and chapter text may already be
    # translated; fixed strings added here stay in English.
    relationships_data = shared["relationships"]
    chapter_order = shared["chapter_order"]
    abstractions = shared["abstractions"]
    chapters_content = shared["chapters"]

    # Attribution footer appended to every generated file.
    sad_attribution = "---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) - Software Architecture Document Mode"
    tutorial_attribution = "---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)"
    attribution = sad_attribution if is_sad else tutorial_attribution

    # --- Generate Mermaid Diagram ---
    mermaid_lines = ["flowchart TD"]
    for i, abstr in enumerate(abstractions):
        node_id = f"A{i}"
        # Double quotes would terminate the quoted Mermaid label early.
        node_label = abstr['name'].replace('"', '')
        mermaid_lines.append(f' {node_id}["{node_label}"]')

    max_label_len = 30
    for rel in relationships_data['details']:
        from_node_id = f"A{rel['from']}"
        to_node_id = f"A{rel['to']}"
        edge_label = rel['label'].replace('"', '').replace('\n', ' ')  # Basic sanitization
        if len(edge_label) > max_label_len:
            # Keep edge labels short so the diagram stays readable.
            edge_label = edge_label[:max_label_len - 3] + "..."
        mermaid_lines.append(f' {from_node_id} -- "{edge_label}" --> {to_node_id}')
    mermaid_diagram = "\n".join(mermaid_lines)
    # --- End Mermaid ---

    # --- Prepare index.md content ---
    if is_sad:
        index_parts = [
            f"# Software Architecture Document: {project_name}\n\n",
            "## Executive Summary\n\n",
            f"{relationships_data['summary']}\n\n",
            f"**Source Repository:** [{repo_url}]({repo_url})\n\n",
            "## System Overview\n\n",
            f"This document describes the software architecture of the {project_name} project. It provides a comprehensive architectural overview of the system, using different views to depict different aspects of the system.\n\n",
            "## Component Diagram\n\n",
            "```mermaid\n",
            mermaid_diagram + "\n",
            "```\n\n",
            "## Table of Contents\n\n",
        ]
    else:
        index_parts = [
            f"# Tutorial: {project_name}\n\n",
            f"{relationships_data['summary']}\n\n",
            f"**Source Repository:** [{repo_url}]({repo_url})\n\n",
            "```mermaid\n",
            mermaid_diagram + "\n",
            "```\n\n",
            "## Chapters\n\n",
        ]

    chapter_files = []

    # In SAD mode the Architecture Overview (C4) chapter is not tied to an
    # abstraction, so it is pulled out of the chapter list and emitted first.
    if is_sad:
        for i, content in enumerate(chapters_content):
            if "# Chapter 1: Architecture Overview" in content:
                c4_model_content = content
                if not c4_model_content.endswith("\n\n"):
                    c4_model_content += "\n\n"
                c4_model_content += sad_attribution

                c4_filename = "01_architecture_overview.md"
                chapter_files.append({"filename": c4_filename, "content": c4_model_content})
                index_parts.append(f"1. [Architecture Overview]({c4_filename})\n")

                # Drop it so the remaining entries line up with chapter_order.
                chapters_content = [c for j, c in enumerate(chapters_content) if j != i]
                break

    # Generate chapter links based on the determined order.
    # NOTE(review): regular chapters are always numbered from 2 (chapter_num + 1),
    # even when no Architecture Overview chapter was found above -- confirm this
    # matches the filenames used for cross-chapter links when the chapters were written.
    chapter_num = 1
    for i, abstraction_index in enumerate(chapter_order):
        # Ensure index is valid and we have content for it
        if not (0 <= abstraction_index < len(abstractions) and i < len(chapters_content)):
            print(f"Warning: Mismatch between chapter order, abstractions, or content at index {i} (abstraction index {abstraction_index}). Skipping file generation for this entry.")
            continue

        abstraction_name = abstractions[abstraction_index]["name"]
        # Sanitize potentially translated name for filename
        safe_name = "".join(c if c.isalnum() else '_' for c in abstraction_name).lower()
        filename = f"{(chapter_num+1):02d}_{safe_name}.md"
        index_parts.append(f"{chapter_num+1}. [{abstraction_name}]({filename})\n")

        chapter_content = chapters_content[i]

        # Renumber the chapter heading to match the file numbering. Only the
        # leading heading is rewritten: a blanket str.replace would also
        # alter subheadings or inline mentions of "# Chapter N:" in the body.
        old_heading = f"# Chapter {chapter_num}:"
        body_from_heading = chapter_content.lstrip()
        if body_from_heading.startswith(old_heading):
            leading_ws = chapter_content[:len(chapter_content) - len(body_from_heading)]
            chapter_content = (
                leading_ws
                + f"# Chapter {chapter_num+1}:"
                + body_from_heading[len(old_heading):]
            )

        if not chapter_content.endswith("\n\n"):
            chapter_content += "\n\n"
        chapter_content += attribution

        chapter_files.append({"filename": filename, "content": chapter_content})
        chapter_num += 1

    # Add attribution to index content (using English fixed string)
    index_parts.append("\n\n" + attribution)
    index_content = "".join(index_parts)

    return {
        "output_path": output_path,
        "index_content": index_content,
        "chapter_files": chapter_files  # List of {"filename": str, "content": str}
    }

def exec(self, prep_res):

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

nodes.py

Latest commit

History

nodes.py

File metadata and controls