Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
package com.testsigma.addons.windows;


import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.testsigma.addons.util.ScreenshotUtils;
import com.testsigma.sdk.*;
import com.testsigma.sdk.annotation.AI;
import com.testsigma.sdk.annotation.Action;
import com.testsigma.sdk.annotation.TestData;
import com.testsigma.sdk.annotation.TestStepResult;
import lombok.Data;
import org.apache.commons.lang3.exception.ExceptionUtils;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;
import java.awt.*;
import java.awt.event.InputEvent;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.Iterator;
import java.util.List;

@Data
@Action(actionText = "Ai: Click on Image/text matching prompt prompt-describing-image",
description = "Locate and click a UI element using a single AI call. " +
"AI reports the image dimensions it analyzed; coordinates are scaled back to screen space automatically. " +
"Fails if the element is not found.",
applicationType = ApplicationType.WINDOWS,
useCustomScreenshot = true)
public class ClickOnImageUsingAi extends WindowsAction {

@TestData(reference = "prompt-describing-image")
private com.testsigma.sdk.TestData queryDescribingElement;

@AI
private com.testsigma.sdk.AI ai;

@TestStepResult
private com.testsigma.sdk.TestStepResult testStepResult;

private static final int HIGHLIGHT_STROKE_WIDTH = 2;
private static final int DOT_RADIUS = 2;

// The AI must report the pixel dimensions of the image it received so we can
// compute the exact scale factor back to original screen coordinates.
private static final String PROMPT =
"You are a UI element locator. Given a screenshot, find the EXACT pixel " +
"bounding box of the requested element.\n\n" +
"STEP 1 — Measure the image:\n" +
" Look at the raw pixel dimensions of the image you received (width × height).\n" +
" You MUST include these as \"imageWidth\" and \"imageHeight\" in your JSON response.\n\n" +
"STEP 2 — Find the element:\n" +
" Locate the element visually using pixel-level analysis. If there is an application opened " +
"consider only the opened application for locating the element \n" +
" For image/graphic elements (text, icons, logos) find the actual picture.\n" +
" If there is an application running in windowed mode than consider only that application for " +
" identification task and ignore task bar and other elements present on wallpaper" +
" For text elements, look for the distinctive font color and style.\n" +
" Report the bounding box as pixel coordinates: top-left corner (x1, y1) and bottom-right corner (x2, y2).\n\n" +
"OUTPUT FORMAT — strict JSON only, no markdown, no explanation:\n" +
"If found:\n" +
" {\"found\": true, \"x1\": <int>, \"y1\": <int>, \"x2\": <int>, \"y2\": <int>, " +
"\"imageWidth\": <int>, \"imageHeight\": <int>, " +
"\"confidence\": <0-100>, \"description\": \"<what you found>\"}\n" +
"If not found:\n" +
" {\"found\": false, \"x1\": 0, \"y1\": 0, \"x2\": 0, \"y2\": 0, " +
"\"imageWidth\": <int>, \"imageHeight\": <int>, " +
"\"confidence\": 0, \"description\": \"<why not found>\"}\n\n" +
"TASK: Find the exact bounding box of: ";

@Override
public Result execute() {

logger.info("=== ClickElementUsingAi: Starting ===");
File screenshotFile = null;
File finalAnnotatedFile = null;

try {

String query = queryDescribingElement.getValue().toString();
logger.info("Query: " + query);

// ── Step 1: Determine logical screen dimensions ──
// Toolkit returns the OS-level logical (DIP) screen size, e.g. 1920×1080 even on
// a 2× HiDPI display. This is what Selenium / WinAppDriver uses for coordinates.
Dimension logicalScreen = Toolkit.getDefaultToolkit().getScreenSize();
int logicalScreenW = logicalScreen.width;
int logicalScreenH = logicalScreen.height;
logger.info("Logical screen size (Toolkit): " + logicalScreenW + "x" + logicalScreenH);

// ── Step 2: Capture the full desktop via Robot ──
// Robot.createScreenCapture returns physical pixels on HiDPI displays
// (e.g. 3840×2160 on a 2× Retina Mac, 3840×2160 on a 200% Windows display).
Robot robot = new Robot();
Rectangle screenRect = new Rectangle(logicalScreen);
BufferedImage desktopCapture = robot.createScreenCapture(screenRect);
int captureW = desktopCapture.getWidth();
int captureH = desktopCapture.getHeight();
logger.info("Robot desktop capture size (physical px): " + captureW + "x" + captureH);

// Display scale = physical / logical. On a non-HiDPI display this is 1.0×1.0.
double displayScaleX = (double) captureW / logicalScreenW;
double displayScaleY = (double) captureH / logicalScreenH;
logger.info(String.format(
"Display scale (capture / logical): %.4fx%.4f", displayScaleX, displayScaleY));

// ── Step 3: Write desktop capture to temp file for AI ──
// Format is chosen by inspecting the captured image's color model:
// • alpha channel present → PNG (JPEG codec does not support alpha)
// • no alpha → JPEG (smaller file; Vertex AI accepts both)
boolean hasAlpha = desktopCapture.getColorModel().hasAlpha();
String imageFormat = hasAlpha ? "PNG" : "JPEG";
String fileExt = hasAlpha ? ".png" : ".jpg";
logger.info(String.format(
"Capture color model: type=%d hasAlpha=%b → writing as %s",
desktopCapture.getType(), hasAlpha, imageFormat));

screenshotFile = File.createTempFile("ai_desktop_capture", fileExt);
if (hasAlpha) {
// PNG supports alpha — write the capture directly.
ImageIO.write(desktopCapture, "PNG", screenshotFile);
} else {
// JPEG requires TYPE_INT_RGB (no alpha). Convert if necessary.
BufferedImage rgbCapture = desktopCapture.getType() == BufferedImage.TYPE_INT_RGB
? desktopCapture
: toRgb(desktopCapture);
ImageIO.write(rgbCapture, "JPEG", screenshotFile);
}
logger.info(String.format(
"%s written: %s (size=%d bytes, dims=%dx%d)",
imageFormat, screenshotFile.getAbsolutePath(),
screenshotFile.length(), captureW, captureH));

// ── Step 4: Build and send the AI request ──
String fullPrompt = PROMPT + query +
"<custom_instructions>\n" +
"{\n" +
" \"provider\": \"vertex-ai\",\n" +
" \"image_detail\": \"high\"\n" +
"}\n" +
"</custom_instructions>";

AIRequest aiRequest = new AIRequest();
aiRequest.setPrompt(fullPrompt);
aiRequest.setModel("anthropic.claude-opus-4-6");
aiRequest.setFiles(List.of(screenshotFile));
Comment on lines +144 to +147
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🌐 Web query:

What are the valid model names for Claude models on Vertex AI in 2026?

💡 Result:

The valid model names (API model IDs) for Claude models on Vertex AI as of April 2026, based on official documentation from Google Cloud and Anthropic, are: Current/latest models (recommended): - claude-opus-4-6 - claude-sonnet-4-6 - claude-haiku-4-5@20251001 - claude-sonnet-4-5@20250929 - claude-opus-4-5@20251101 - claude-opus-4-1@20250805 Legacy/older models (still available but marked deprecated in some docs): - claude-sonnet-4@20250514 - claude-opus-4@20250514 - claude-3-5-haiku@20241022 - claude-3-haiku@20240307 - claude-3-7-sonnet@20250219 Note: Availability may vary by region. Check Vertex AI Model Garden for the most up-to-date list and retirement dates (e.g., some retire no sooner than 2027). No confirmed ID found for newer models like Sonnet 5 "Fennec" on Vertex AI.

Citations:


Remove the "anthropic." prefix from the model identifier.

The model ID should be "claude-opus-4-6" not "anthropic.claude-opus-4-6". The Vertex AI API model IDs for Claude do not include the "anthropic." prefix.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@image_based_actions/src/main/java/com/testsigma/addons/windows/ClickOnImageUsingAi.java`
around lines 144 - 147, The model identifier passed to AIRequest currently
includes the "anthropic." prefix; update the call where AIRequest is created
(the AIRequest instance and its setModel usage) to use "claude-opus-4-6" instead
of "anthropic.claude-opus-4-6" (the block that sets fullPrompt, calls
aiRequest.setModel(...), and attaches files like screenshotFile should be
modified accordingly).


logger.info("Sending AI request...");
String aiResponse = ai.invokeAI(aiRequest);
logger.info("AI response: " + aiResponse);

// ── Step 5: Parse the AI response ──
JsonNode responseNode = parseJson(aiResponse);
if (responseNode == null) {
setErrorMessage("Failed to get the image response from ai (contact to support)" );
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Minor grammatical issue in error message.

The error message contains "contact to support" which should be "contact support".

Proposed fix
-                setErrorMessage("Failed to get the image response from ai (contact to support)" );
+                setErrorMessage("Failed to get the image response from AI (contact support)");
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
setErrorMessage("Failed to get the image response from ai (contact to support)" );
setErrorMessage("Failed to get the image response from AI (contact support)");
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@image_based_actions/src/main/java/com/testsigma/addons/windows/ClickOnImageUsingAi.java`
at line 156, Update the error message passed to setErrorMessage in
ClickOnImageUsingAi (the call currently setting "Failed to get the image
response from ai (contact to support)") to correct the grammar and
capitalization to "Failed to get the image response from AI (contact support)",
matching the proposed fix above; locate the setErrorMessage(...) invocation in
the ClickOnImageUsingAi class/method and replace the string accordingly.

logger.info("Failed to parse AI response as JSON. Raw response: " + aiResponse);
finalAnnotatedFile = ScreenshotUtils.saveScreenshotToFile(desktopCapture, "ai_click_failed");
ScreenshotUtils.uploadScreenshotToS3(testStepResult, finalAnnotatedFile , logger);
return Result.FAILED;
}

boolean found = responseNode.path("found").asBoolean(false);
if (!found) {
String reason = responseNode.path("description").asText("element not found");
finalAnnotatedFile = ScreenshotUtils.saveScreenshotToFile(desktopCapture, "ai_click_failed");
ScreenshotUtils.uploadScreenshotToS3(testStepResult, finalAnnotatedFile , logger);
setErrorMessage("AI could not locate '" + query + "': " + reason);
return Result.FAILED;
}

int aiX1 = responseNode.path("x1").asInt(0);
int aiY1 = responseNode.path("y1").asInt(0);
int aiX2 = responseNode.path("x2").asInt(0);
int aiY2 = responseNode.path("y2").asInt(0);
int imageWidth = responseNode.path("imageWidth").asInt(0);
int imageHeight = responseNode.path("imageHeight").asInt(0);
int confidence = responseNode.path("confidence").asInt(50);
String description = responseNode.path("description").asText("");

logger.info(String.format(
"AI result — bbox: (%d,%d)-(%d,%d) | AI image dims: %dx%d | confidence: %d | desc: '%s'",
aiX1, aiY1, aiX2, aiY2, imageWidth, imageHeight, confidence, description));

if (imageWidth <= 0 || imageHeight <= 0) {
setErrorMessage(String.format(
"AI returned invalid image dimensions (imageWidth=%d, imageHeight=%d). " +
"Cannot compute scale factor.", imageWidth, imageHeight));
// Upload the original desktop capture for debugging.
finalAnnotatedFile = ScreenshotUtils.saveScreenshotToFile(desktopCapture, "ai_click_failed");
ScreenshotUtils.uploadScreenshotToS3(testStepResult, finalAnnotatedFile , logger);
return Result.FAILED;
}

// ── Step 6: Two-stage coordinate scaling ──
//
// Stage A — AI image → capture pixels
// The AI may receive a down-sampled version of the JPEG we sent.
// imageWidth/imageHeight is what the AI actually analyzed; scale back
// to the physical pixel space of the Robot capture.
double aiToCapX = (double) captureW / imageWidth;
double aiToCapY = (double) captureH / imageHeight;
logger.info(String.format(
"Stage-A scale (AI→capture): %.4fx%.4f (capture %dx%d / ai-image %dx%d)",
aiToCapX, aiToCapY, captureW, captureH, imageWidth, imageHeight));

int capX1 = (int) Math.round(aiX1 * aiToCapX);
int capY1 = (int) Math.round(aiY1 * aiToCapY);
int capX2 = (int) Math.round(aiX2 * aiToCapX);
int capY2 = (int) Math.round(aiY2 * aiToCapY);
int capCX = (capX1 + capX2) / 2;
int capCY = (capY1 + capY2) / 2;
logger.info(String.format(
"Capture-pixel bbox: (%d,%d)-(%d,%d) center: (%d,%d)",
capX1, capY1, capX2, capY2, capCX, capCY));

// Stage B — capture pixels → logical screen coordinates
// The click in Step 8 is issued through Robot using logical (DIP) coordinates,
// so divide out the HiDPI display scale.
int logicalX1 = (int) Math.round(capX1 / displayScaleX);
int logicalY1 = (int) Math.round(capY1 / displayScaleY);
int logicalX2 = (int) Math.round(capX2 / displayScaleX);
int logicalY2 = (int) Math.round(capY2 / displayScaleY);
int logicalCX = (logicalX1 + logicalX2) / 2;
int logicalCY = (logicalY1 + logicalY2) / 2;
logger.info(String.format(
"Stage-B scale (capture→logical): 1/%.4f × 1/%.4f",
displayScaleX, displayScaleY));
logger.info(String.format(
"Logical screen bbox: (%d,%d)-(%d,%d) center: (%d,%d)",
logicalX1, logicalY1, logicalX2, logicalY2, logicalCX, logicalCY));

// ── Step 7: Annotate the desktop capture (in capture-pixel space) ──
BufferedImage finalAnnotated = drawHighlightRect(
desktopCapture, capX1, capY1, capX2, capY2, capCX, capCY);
finalAnnotatedFile = ScreenshotUtils.saveScreenshotToFile(finalAnnotated, "ai_click_elem_result");
logger.info("Uploading annotated screenshot: " + finalAnnotatedFile.getAbsolutePath());
ScreenshotUtils.uploadScreenshotToS3(testStepResult, finalAnnotatedFile, logger);

// ── Step 8: Click via Robot (logical coordinates) ──
logger.info(String.format(
"Clicking via Robot — logical=(%d,%d) confidence=%d",
logicalCX, logicalCY, confidence));
robot.mouseMove(logicalCX, logicalCY);
robot.mousePress(InputEvent.BUTTON1_DOWN_MASK);
robot.mouseRelease(InputEvent.BUTTON1_DOWN_MASK);

logger.info("Click performed successfully");
setSuccessMessage(String.format(
"Successfully clicked '%s' at logical (%d,%d) | capture bbox (%d,%d)-(%d,%d) | confidence=%d | format=%s | %s",
query, logicalCX, logicalCY,
capX1, capY1, capX2, capY2, confidence, imageFormat, description));
return Result.SUCCESS;

} catch (Exception e) {
logger.info("Exception: " + ExceptionUtils.getStackTrace(e));
setErrorMessage("Failed to click using AI. Error: " + e.getMessage());
return Result.FAILED;
} finally {
deleteQuietly(screenshotFile);
deleteQuietly(finalAnnotatedFile);
}
}

// ─────────────────────────────────────────────────────────────────────────
// Convert any BufferedImage to TYPE_INT_RGB (strips alpha for JPEG output).
// ─────────────────────────────────────────────────────────────────────────
/**
 * Renders {@code src} onto a freshly allocated {@link BufferedImage#TYPE_INT_RGB}
 * image of the same width and height and returns that new image.
 *
 * <p>The source image is drawn at the origin and is not modified. The target
 * type has no alpha channel, which is what makes the result suitable for the
 * JPEG write performed by the caller.
 *
 * @param src image to copy; must not be {@code null}
 * @return a new {@code TYPE_INT_RGB} image containing the rendered source
 */
private BufferedImage toRgb(BufferedImage src) {
    final int width = src.getWidth();
    final int height = src.getHeight();

    BufferedImage opaqueCopy = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
    Graphics2D canvas = opaqueCopy.createGraphics();
    canvas.drawImage(src, 0, 0, null);
    // Release the graphics context as soon as the single draw call is done.
    canvas.dispose();

    return opaqueCopy;
}

// ─────────────────────────────────────────────────────────────────────────
// Draw a magenta bounding rectangle + green dot at the click center (cx, cy).
// ─────────────────────────────────────────────────────────────────────────
private BufferedImage drawHighlightRect(BufferedImage original,
int x1, int y1, int x2, int y2,
int cx, int cy) {
BufferedImage copy = new BufferedImage(
original.getWidth(), original.getHeight(), BufferedImage.TYPE_INT_RGB);
Graphics2D g = copy.createGraphics();
g.drawImage(original, 0, 0, null);

// Clamp rectangle to image bounds
int rx1 = Math.max(0, x1);
int ry1 = Math.max(0, y1);
int rx2 = Math.min(original.getWidth() - 1, x2);
int ry2 = Math.min(original.getHeight() - 1, y2);

// Magenta bounding rectangle
g.setColor(Color.MAGENTA);
Comment on lines +293 to +294
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Comment does not match implementation.

The comment says "Green bounding rectangle" but the code uses Color.MAGENTA. Either update the comment or the color to match.

Proposed fix (update comment)
-        // Green bounding rectangle
+        // Magenta bounding rectangle
         g.setColor(Color.MAGENTA);
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Green bounding rectangle
g.setColor(Color.MAGENTA);
// Magenta bounding rectangle
g.setColor(Color.MAGENTA);
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@image_based_actions/src/main/java/com/testsigma/addons/windows/ClickOnImageUsingAi.java`
around lines 293 - 294, The inline comment above the color setting in
ClickOnImageUsingAi (the line using g.setColor(Color.MAGENTA)) incorrectly says
"Green bounding rectangle"; update either the comment or the color so they
match: either change the comment to "Magenta bounding rectangle" or change
Color.MAGENTA to Color.GREEN (or Color.GREEN.darker()/brighter as appropriate)
in the paint/draw method where g.setColor is called to ensure the comment and
the visual color are consistent.

g.setStroke(new BasicStroke(HIGHLIGHT_STROKE_WIDTH));
g.drawRect(rx1, ry1, rx2 - rx1, ry2 - ry1);

// Green filled dot at click center
g.setColor(Color.GREEN);
g.fillOval(cx - DOT_RADIUS, cy - DOT_RADIUS, DOT_RADIUS * 2, DOT_RADIUS * 2);

g.dispose();
return copy;
}

// ─────────────────────────────────────────────────────────────────────────
// Extract JSON object from AI response, tolerating leading/trailing text.
// ─────────────────────────────────────────────────────────────────────────
/**
 * Parses the raw AI reply into a JSON tree.
 *
 * <p>The reply is trimmed, any Markdown code-fence markers (three backticks,
 * optionally followed by lowercase letters) are removed, and, when the text
 * contains a <code>{</code> that precedes a later <code>}</code>, only the span from the
 * first <code>{</code> to the last <code>}</code> is handed to the parser. Otherwise the
 * cleaned text is parsed as-is.
 *
 * @param aiResponse raw text returned by the AI call
 * @return the parsed tree, or {@code null} when any exception is raised while
 *         cleaning or parsing (the failure and the raw reply are logged)
 */
private JsonNode parseJson(String aiResponse) {
    try {
        String candidate = aiResponse.trim();

        // Strip Markdown fences so that a fenced JSON reply still parses.
        boolean fenced = candidate.contains("```");
        if (fenced) {
            String withoutOpeners = candidate.replaceAll("(?s)```[a-z]*\\s*", "");
            candidate = withoutOpeners.replaceAll("```", "").trim();
        }

        // Keep only the outermost {...} span when one exists, which discards
        // any explanatory text before or after the JSON object.
        int firstBrace = candidate.indexOf('{');
        int lastBrace = candidate.lastIndexOf('}');
        boolean hasObjectSpan = firstBrace != -1 && lastBrace > firstBrace;
        String payload = hasObjectSpan
                ? candidate.substring(firstBrace, lastBrace + 1)
                : candidate;

        ObjectMapper mapper = new ObjectMapper();
        return mapper.readTree(payload);
    } catch (Exception e) {
        logger.info("Failed to parse AI JSON: " + e.getMessage() + " | raw: " + aiResponse);
        return null;
    }
}

/**
 * Deletes {@code file} when it is non-null and currently exists; does nothing
 * otherwise.
 *
 * @param file file to remove; may be {@code null}
 */
private void deleteQuietly(File file) {
    if (file == null || !file.exists()) {
        return;
    }
    // The boolean returned by delete() is not inspected, so a failed delete
    // goes unreported.
    file.delete();
}
}