Skip to content

Commit 3e87169

Browse files
committed
$timings improvements
* train_dreambooth will record train and upload times * test.py will show all $timings (not just init and inference)
1 parent 07ba610 commit 3e87169

File tree

3 files changed

+67
-11
lines changed

3 files changed

+67
-11
lines changed

app.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -276,8 +276,9 @@ def inference(all_inputs: dict) -> dict:
276276
result = TrainDreamBooth(model_id, pipeline, model_inputs)
277277
send("inference", "done", {"startRequestId": startRequestId})
278278
inferenceTime = get_now() - inferenceStart
279-
timings = {"init": initTime, "inference": inferenceTime}
280-
result.update({"timings": timings})
279+
timings = result.get("$timings", {})
280+
timings = {"init": initTime, "inference": inferenceTime, **timings}
281+
result.update({"$timings": timings})
281282
return result
282283

283284
with torch.inference_mode():

test.py

Lines changed: 36 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import json
88
import sys
99
import time
10+
import datetime
1011
import argparse
1112
import distutils
1213
from uuid import uuid4
@@ -76,8 +77,27 @@ def runTest(name, banana, extraCallInputs, extraModelInputs):
7677
"startOnly": False,
7778
}
7879
response = requests.post("https://api.banana.dev/start/v4/", json=payload)
79-
8080
result = response.json()
81+
callID = result.get("callID")
82+
83+
if result.get("finished", None) == False:
84+
while result.get("message", None) != "success":
85+
secondsSinceStart = round((time.time() - start) / 1000)
86+
print(str(datetime.datetime.now()) + f": t+{secondsSinceStart}s")
87+
print(json.dumps(result, indent=4))
88+
print
89+
payload = {
90+
"id": str(uuid4()),
91+
"created": int(time.time()),
92+
"longPoll": True,
93+
"apiKey": BANANA_API_KEY,
94+
"callID": callID,
95+
}
96+
response = requests.post(
97+
"https://api.banana.dev/check/v4/", json=payload
98+
)
99+
result = response.json()
100+
81101
modelOutputs = result.get("modelOutputs", None)
82102
if modelOutputs == None:
83103
finish = time.time() - start
@@ -91,13 +111,22 @@ def runTest(name, banana, extraCallInputs, extraModelInputs):
91111

92112
finish = time.time() - start
93113
timings = result.get("$timings")
114+
94115
if timings:
95-
init = timings.get("init") / 1000
96-
inference = timings.get("inference") / 1000
97-
print(
98-
f"Request took {finish:.1f}s ("
99-
+ f"init: {init:.1f}s, inference: {inference:.1f}s)"
100-
)
116+
timings_str = json.dumps(
117+
dict(
118+
map(
119+
lambda item: (
120+
item[0],
121+
f"{item[1]/1000:.1f}s"
122+
if item[1] > 1000
123+
else str(item[1]) + "ms",
124+
),
125+
timings.items(),
126+
)
127+
)
128+
).replace('"', "")[1:-1]
129+
print(f"Request took {finish:.1f}s ({timings_str})")
101130
else:
102131
print(f"Request took {finish:.1f}s")
103132

train_dreambooth.py

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,9 @@
3535
from torchvision import transforms
3636
from tqdm.auto import tqdm
3737
from transformers import CLIPTextModel, CLIPTokenizer
38+
3839
from precision import revision, torch_dtype
40+
from send import send, get_now
3941

4042

4143
# Our original code in docker-diffusers-api:
@@ -119,8 +121,8 @@ def TrainDreamBooth(model_id: str, pipeline, model_inputs):
119121
args = argparse.Namespace(**params)
120122

121123
print(args)
122-
main(args, pipeline)
123-
return {"done": True}
124+
result = main(args, pipeline)
125+
return result
124126

125127

126128
# What follows is mostly the original train_dreambooth.py
@@ -571,6 +573,10 @@ def collate_fn(examples):
571573
progress_bar.set_description("Steps")
572574
global_step = 0
573575

576+
# DDA
577+
send("training", "start", {}, True)
578+
training_start = get_now()
579+
574580
for epoch in range(args.num_train_epochs):
575581
unet.train()
576582
if args.train_text_encoder:
@@ -657,6 +663,12 @@ def collate_fn(examples):
657663

658664
accelerator.wait_for_everyone()
659665

666+
# DDA
667+
send("training", "done")
668+
training_total = get_now() - training_start
669+
upload_start = 0
670+
upload_total = 0
671+
660672
# Create the pipeline using using the trained modules and save it.
661673
if accelerator.is_main_process:
662674
pipeline = StableDiffusionPipeline.from_pretrained(
@@ -669,6 +681,10 @@ def collate_fn(examples):
669681
pipeline.save_pretrained(args.output_dir)
670682

671683
if args.push_to_hub:
684+
# DDA
685+
send("uploading", "start", {}, True)
686+
upload_start = get_now()
687+
672688
repo.push_to_hub(
673689
commit_message="End of training",
674690
# DDA need to think about this, quite nice to not block, then could
@@ -678,4 +694,14 @@ def collate_fn(examples):
678694
auto_lfs_prune=True,
679695
)
680696

697+
# DDA
698+
send("uploading", "done")
699+
upload_total = get_now() - upload_start
700+
681701
accelerator.end_training()
702+
703+
# DDA
704+
return {
705+
"done": True,
706+
"$timings": {"training": training_total, "upload": upload_total},
707+
}

0 commit comments

Comments (0)