diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index e543f548..8d4d42d2 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -7,6 +7,12 @@ on: type: string required: true description: 'The environment to build for' + workflow_dispatch: + inputs: + environment: + type: string + required: true + description: 'The environment to build for' permissions: contents: read diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index aa365a41..4d335ccc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -81,64 +81,64 @@ jobs: dist/*.dmg dist/*.dmg.blockmap - # build-windows: - # runs-on: ubuntu-latest - # environment: ${{ github.event.release.target_commitish == 'main' && 'production' || 'develop' }} + build-windows: + runs-on: ubuntu-latest + environment: ${{ github.event.release.target_commitish == 'main' && 'production' || 'develop' }} - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 + steps: + - name: Checkout repository + uses: actions/checkout@v4 - # - name: Set up Bun - # uses: oven-sh/setup-bun@v1 + - name: Set up Bun + uses: oven-sh/setup-bun@v1 - # - name: Install dependencies and MinGW-w64 - # run: | - # sudo apt-get update && sudo apt-get install -y mingw-w64 - # bun install + - name: Install dependencies and MinGW-w64 + run: | + sudo apt-get update && sudo apt-get install -y mingw-w64 + bun install - # - name: Set up Rust - # uses: dtolnay/rust-toolchain@stable - # with: - # toolchain: stable - # targets: 'x86_64-pc-windows-gnu' + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + targets: 'x86_64-pc-windows-gnu' - # - name: Set up environment - # run: | - # echo "VITE_AUTH0_DOMAIN=\"${{ secrets.VITE_AUTH0_DOMAIN }}\"" >> .env - # echo "VITE_AUTH0_CLIENT_ID=\"${{ secrets.VITE_AUTH0_CLIENT_ID }}\"" >> .env - # echo "VITE_AUTH0_AUDIENCE=\"${{ secrets.VITE_AUTH0_AUDIENCE }}\"" >> .env - # echo "VITE_POSTHOG_API_KEY=\"${{ secrets.VITE_POSTHOG_API_KEY }}\"" >> .env - # echo "VITE_POSTHOG_HOST=\"${{ secrets.VITE_POSTHOG_HOST }}\"" >> .env - # echo "VITE_GRPC_BASE_URL=\"${{ vars.VITE_GRPC_BASE_URL }}\"" >> .env - # echo "VITE_UPDATER_BUCKET=\"${{ vars.VITE_UPDATER_BUCKET }}\"" >> .env - # echo "VITE_SENTRY_DSN=\"${{ vars.VITE_SENTRY_DSN }}\"" >> .env - # echo "VITE_SENTRY_ENV=\"${{ vars.VITE_SENTRY_ENV }}\"" >> .env - # echo "VITE_SENTRY_TRACES_SAMPLE_RATE=\"${{ vars.VITE_SENTRY_TRACES_SAMPLE_RATE }}\"" >> .env - # echo "VITE_SENTRY_PROFILES_SAMPLE_RATE=\"${{ vars.VITE_SENTRY_PROFILES_SAMPLE_RATE }}\"" >> .env - # echo "VITE_ITO_VERSION=\"${GITHUB_REF#refs/tags/v}\"" >> .env - # echo "ITO_ENV=\"${{ github.event.release.target_commitish == 'main' && 'prod' || 'dev' }}\"" >> .env - # echo "VITE_ITO_ENV=\"${{ github.event.release.target_commitish == 'main' && 'prod' || 'dev' }}\"" >> .env - # echo "GH_TOKEN=\"${{ secrets.GITHUB_TOKEN }}\"" >> .env - # echo "GRPC_BASE_URL=\"${{ vars.GRPC_BASE_URL }}\"" >> .env - # echo "Created .env file:" - # cat .env - - # - name: Build and package Windows application (unsigned) - # env: - # SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} - # run: ./build-app.sh windows - - # - name: Upload unsigned Windows artifacts for signing - # uses: actions/upload-artifact@v4 - # with: - # name: Windows-Unsigned-Artifacts - # path: | - # dist/*.exe - # dist/*.yml - # dist/*.nsis.7z - # dist/*.zip - # dist/*.blockmap + - name: Set up environment + run: | + echo "VITE_AUTH0_DOMAIN=\"${{ secrets.VITE_AUTH0_DOMAIN }}\"" >> .env + echo "VITE_AUTH0_CLIENT_ID=\"${{ secrets.VITE_AUTH0_CLIENT_ID }}\"" >> .env + echo "VITE_AUTH0_AUDIENCE=\"${{ secrets.VITE_AUTH0_AUDIENCE }}\"" >> .env + echo "VITE_POSTHOG_API_KEY=\"${{ secrets.VITE_POSTHOG_API_KEY }}\"" >> .env + echo "VITE_POSTHOG_HOST=\"${{ secrets.VITE_POSTHOG_HOST }}\"" >> .env + echo "VITE_GRPC_BASE_URL=\"${{ vars.VITE_GRPC_BASE_URL }}\"" >> .env + echo "VITE_UPDATER_BUCKET=\"${{ vars.VITE_UPDATER_BUCKET }}\"" >> .env + echo "VITE_SENTRY_DSN=\"${{ vars.VITE_SENTRY_DSN }}\"" >> .env + echo "VITE_SENTRY_ENV=\"${{ vars.VITE_SENTRY_ENV }}\"" >> .env + echo "VITE_SENTRY_TRACES_SAMPLE_RATE=\"${{ vars.VITE_SENTRY_TRACES_SAMPLE_RATE }}\"" >> .env + echo "VITE_SENTRY_PROFILES_SAMPLE_RATE=\"${{ vars.VITE_SENTRY_PROFILES_SAMPLE_RATE }}\"" >> .env + echo "VITE_ITO_VERSION=\"${GITHUB_REF#refs/tags/v}\"" >> .env + echo "ITO_ENV=\"${{ github.event.release.target_commitish == 'main' && 'prod' || 'dev' }}\"" >> .env + echo "VITE_ITO_ENV=\"${{ github.event.release.target_commitish == 'main' && 'prod' || 'dev' }}\"" >> .env + echo "GH_TOKEN=\"${{ secrets.GITHUB_TOKEN }}\"" >> .env + echo "GRPC_BASE_URL=\"${{ vars.GRPC_BASE_URL }}\"" >> .env + echo "Created .env file:" + cat .env + + - name: Build and package Windows application (unsigned) + env: + SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} + run: ./build-app.sh windows + + - name: Upload unsigned Windows artifacts for signing + uses: actions/upload-artifact@v4 + with: + name: Windows-Unsigned-Artifacts + path: | + dist/*.exe + dist/*.yml + dist/*.nsis.7z + dist/*.zip + dist/*.blockmap # sign-windows: # runs-on: windows-latest @@ -232,7 +232,7 @@ jobs: upload-to-s3: runs-on: ubuntu-latest - needs: [build-mac] + needs: [build-mac, build-windows] environment: ${{ github.event.release.target_commitish == 'main' && 'production' || 'develop' }} steps: @@ -242,11 +242,11 @@ jobs: name: Mac-Build-Artifacts path: mac-dist - # - name: Download Windows Build Artifacts - # uses: actions/download-artifact@v4 - # with: - # name: Windows-Build-Artifacts - # path: windows-dist + - name: Download Windows Build Artifacts + uses: actions/download-artifact@v4 + with: + name: Windows-Unsigned-Artifacts + path: windows-dist - name: Configure AWS credentials (OIDC) uses: aws-actions/configure-aws-credentials@v2 @@ -263,7 +263,7 @@ jobs: echo "Listing existing files in root of releases/" aws s3 ls "$BUCKET/" | grep -vE '/$' | awk '{print $4}' > existing_files.txt - echo "Copying Mac artifacts" + echo "Combining Mac and Windows artifacts" mkdir -p combined-dist # Copy Mac artifacts @@ -271,10 +271,10 @@ jobs: cp -r mac-dist/* combined-dist/ fi - # # Copy Windows artifacts - # if [ -d "windows-dist" ]; then - # cp -r windows-dist/* combined-dist/ - # fi + # Copy Windows artifacts + if [ -d "windows-dist" ]; then + cp -r windows-dist/* combined-dist/ + fi echo "Files to upload:" ls -la combined-dist/ @@ -285,7 +285,11 @@ jobs: -name '*universal-mac.zip' -o \ -name '*universal-mac.zip.blockmap' -o \ -name '*.dmg' -o \ - -name '*.dmg.blockmap' \ + -name '*.dmg.blockmap' -o \ + -name '*.exe' -o \ + -name '*.nsis.7z' -o \ + -name '*.zip' -o \ + -name '*.blockmap' \ \) | xargs -I{} basename {} > uploaded_root_files.txt echo "Uploading full combined dist to versioned folder: $BUCKET/${{ github.ref_name }}/" @@ -297,7 +301,11 @@ jobs: $(find combined-dist -maxdepth 1 -name '*universal-mac.zip') \ $(find combined-dist -maxdepth 1 -name '*universal-mac.zip.blockmap') \ $(find combined-dist -maxdepth 1 -name '*.dmg') \ - $(find combined-dist -maxdepth 1 -name '*.dmg.blockmap') + $(find combined-dist -maxdepth 1 -name '*.dmg.blockmap') \ + $(find combined-dist -maxdepth 1 -name '*.exe') \ + $(find combined-dist -maxdepth 1 -name '*.nsis.7z') \ + $(find combined-dist -maxdepth 1 -name '*.zip') \ + $(find combined-dist -maxdepth 1 -name '*.blockmap') do if [ -f "$FILE" ]; then aws s3 cp "$FILE" $BUCKET/ @@ -335,7 +343,7 @@ jobs: aws cloudfront create-invalidation \ --distribution-id "${{ vars.CLOUDFRONT_DISTRIBUTION_ID }}" \ - --paths "/*.yml" "/*.dmg" "/*.zip" "/*.blockmap" + --paths "/*.yml" "/*.dmg" "/*.exe" "/*.zip" "/*.blockmap" - name: Upload installers to GitHub Release uses: softprops/action-gh-release@v1 @@ -343,3 +351,4 @@ jobs: tag_name: ${{ github.ref_name }} files: | mac-dist/*.dmg + windows-dist/*.exe diff --git a/.gitignore b/.gitignore index b912165d..2b369968 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ dist/* .DS_Store native/**/target native/target-rust-analyzer +native/**/.build out.json tsconfig.node.tsbuildinfo server/deploy-dev.sh diff --git a/README.md b/README.md index fa3a7b76..a3b1a8f1 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ ### Installation -1. **Download the latest release** from [heyito.ai](https://www.heyito.ai/) or the [GitHub releases page](https://github.com/demox-labs/ito/releases) +1. **Download the latest release** from [heyito.ai](https://www.heyito.ai/) or the [GitHub releases page](https://github.com/heyito/ito/releases) 2. **Install the application**: - **macOS**: Open the `.dmg` file and drag Ito to Applications diff --git a/app/components/home/contents/settings/AdvancedSettingsContent.tsx b/app/components/home/contents/settings/AdvancedSettingsContent.tsx index 617f4121..28083e44 100644 --- a/app/components/home/contents/settings/AdvancedSettingsContent.tsx +++ b/app/components/home/contents/settings/AdvancedSettingsContent.tsx @@ -3,6 +3,7 @@ import { useAdvancedSettingsStore, } from '@/app/store/useAdvancedSettingsStore' import { ChangeEvent, useEffect, useRef, useState } from 'react' +import { useWindowContext } from '@/app/components/window/WindowContext' type LlmSettingConfig = { name: keyof LlmSettings @@ -188,9 +189,12 @@ export default function AdvancedSettingsContent() { const { llm, grammarServiceEnabled, + macosAccessibilityContextEnabled, setLlmSettings, setGrammarServiceEnabled, + setMacosAccessibilityContextEnabled, } = useAdvancedSettingsStore() + const windowContext = useWindowContext() const debounceRef = useRef(null) useEffect(() => { @@ -204,16 +208,20 @@ export default function AdvancedSettingsContent() { function scheduleAdvancedSettingsUpdate( nextLlm: LlmSettings, nextGrammarEnabled: boolean, + nextMacosAccessibilityEnabled: boolean, ) { if (debounceRef.current) { clearTimeout(debounceRef.current) } debounceRef.current = setTimeout(async () => { - await window.api.updateAdvancedSettings({ + const settingsToSave = { llm: nextLlm, grammarServiceEnabled: nextGrammarEnabled, - }) + macosAccessibilityContextEnabled: nextMacosAccessibilityEnabled, + } + console.log('[AdvancedSettings] Saving settings...') + await window.api.updateAdvancedSettings(settingsToSave) }, 1000) } @@ -224,13 +232,29 @@ export default function AdvancedSettingsContent() { const newValue = e.target.value const updatedLlm = { ...llm, [config.name]: newValue } setLlmSettings({ [config.name]: newValue }) - scheduleAdvancedSettingsUpdate(updatedLlm, grammarServiceEnabled) + scheduleAdvancedSettingsUpdate( + updatedLlm, + grammarServiceEnabled, + macosAccessibilityContextEnabled, + ) } function handleGrammarServiceToggle(e: ChangeEvent) { const enabled = e.target.checked setGrammarServiceEnabled(enabled) - scheduleAdvancedSettingsUpdate(llm, enabled) + scheduleAdvancedSettingsUpdate( + llm, + enabled, + macosAccessibilityContextEnabled, + ) + } + + function handleMacosAccessibilityContextToggle( + e: ChangeEvent, + ) { + const enabled = e.target.checked + setMacosAccessibilityContextEnabled(enabled) + scheduleAdvancedSettingsUpdate(llm, grammarServiceEnabled, enabled) } return ( @@ -274,6 +298,31 @@ export default function AdvancedSettingsContent() { + + {windowContext?.window?.platform === 'darwin' && ( +
+

+ Context +

+ +
+ )} ) diff --git a/app/store/useAdvancedSettingsStore.ts b/app/store/useAdvancedSettingsStore.ts index f33a423b..3ade50be 100644 --- a/app/store/useAdvancedSettingsStore.ts +++ b/app/store/useAdvancedSettingsStore.ts @@ -16,8 +16,10 @@ export interface LlmSettings { interface AdvancedSettingsState { llm: LlmSettings grammarServiceEnabled: boolean + macosAccessibilityContextEnabled: boolean setLlmSettings: (settings: Partial) => void setGrammarServiceEnabled: (enabled: boolean) => void + setMacosAccessibilityContextEnabled: (enabled: boolean) => void } // Initialize from electron store @@ -40,6 +42,8 @@ const getInitialState = () => { }, grammarServiceEnabled: storedAdvancedSettings.grammarServiceEnabled ?? false, + macosAccessibilityContextEnabled: + storedAdvancedSettings.macosAccessibilityContextEnabled ?? false, } } @@ -79,5 +83,12 @@ export const useAdvancedSettingsStore = create(set => { return partialState }) }, + setMacosAccessibilityContextEnabled: (enabled: boolean) => { + set(() => { + const partialState = { macosAccessibilityContextEnabled: enabled } + syncToStore(partialState) + return partialState + }) + }, } }) diff --git a/build-binaries.sh b/build-binaries.sh index ae8cbe4a..6f4df703 100755 --- a/build-binaries.sh +++ b/build-binaries.sh @@ -61,6 +61,20 @@ build_native_workspace() { print_info "Creating symlink: x64-apple-darwin -> x86_64-apple-darwin" ln -sfn x86_64-apple-darwin target/x64-apple-darwin fi + + # Build Swift packages + print_info "Building Swift packages..." + cd cursor-context + if [ "$mac_target" = "aarch64-apple-darwin" ]; then + swift build -c release --arch arm64 + else + swift build -c release --arch x86_64 + fi + # Copy built binary to Rust target directory and re-sign for code signing compatibility + cp .build/release/cursor-context "../target/$mac_target/release/" + xattr -cr "../target/$mac_target/release/cursor-context" + codesign --force --sign - "../target/$mac_target/release/cursor-context" 2>/dev/null || true + cd .. fi # --- Windows Build --- diff --git a/lib/main/context/ContextGrabber.ts b/lib/main/context/ContextGrabber.ts index ef1cb6b7..9b715573 100644 --- a/lib/main/context/ContextGrabber.ts +++ b/lib/main/context/ContextGrabber.ts @@ -9,6 +9,7 @@ import { import { canGetContextFromCurrentApp } from '../../utils/applicationDetection' import log from 'electron-log' import { timingCollector, TimingEventName } from '../timing/TimingCollector' +import { macOSAccessibilityContextProvider } from '../../media/macOSAccessibilityContextProvider' export interface ContextData { vocabularyWords: string[] @@ -92,12 +93,48 @@ export class ContextGrabber { return '' } + const { macosAccessibilityContextEnabled } = getAdvancedSettings() + + // Try accessibility API first if enabled + if ( + process.platform === 'darwin' && + macosAccessibilityContextEnabled && + macOSAccessibilityContextProvider.isRunning() + ) { + try { + const result = await timingCollector.timeAsync( + TimingEventName.CURSOR_CONTEXT_GATHER, + async () => + await macOSAccessibilityContextProvider.getCursorContext({ + maxCharsBefore: 1000, + maxCharsAfter: 1000, + timeout: 500, + debug: false, + }), + ) + + if (result.success && result.context?.selectedText) { + console.log( + '[ContextGrabber] Got selected text via accessibility API', + ) + return result.context.selectedText.trim() + } + } catch (error) { + console.log( + '[ContextGrabber] Accessibility API failed, falling back to keyboard:', + error, + ) + } + } + + // Fallback to keyboard-based method + console.log('[ContextGrabber] Using keyboard method for selected text') try { const text = await timingCollector.timeAsync( TimingEventName.SELCTED_TEXT_GATHER, async () => await getSelectedTextString(), ) - console.log('[ContextGrabber] Selected text:', text) + console.log('[ContextGrabber] Selected text from keyboard:', text) return text && text.trim().length > 0 ? text : '' } catch (error) { log.error('[ContextGrabber] Error getting context text:', error) @@ -115,6 +152,40 @@ export class ContextGrabber { public async getCursorContextForGrammar( contextLength: number = 4, ): Promise { + const { macosAccessibilityContextEnabled } = getAdvancedSettings() + + // Try accessibility API first if enabled + if ( + process.platform === 'darwin' && + macosAccessibilityContextEnabled && + macOSAccessibilityContextProvider.isRunning() + ) { + try { + const result = await macOSAccessibilityContextProvider.getCursorContext( + { + maxCharsBefore: contextLength, + maxCharsAfter: 0, + timeout: 500, + debug: false, + }, + ) + + if (result.success && result.context?.textBefore) { + console.log( + '[ContextGrabber] Got cursor context via accessibility API', + ) + return result.context.textBefore + } + } catch (error) { + console.log( + '[ContextGrabber] Accessibility API failed, falling back to keyboard:', + error, + ) + } + } + + // Fallback to keyboard-based method + console.log('[ContextGrabber] Using keyboard method for cursor context') try { const canGetContext = await canGetContextFromCurrentApp() diff --git a/lib/main/itoSessionManager.test.ts b/lib/main/itoSessionManager.test.ts index 76e3fdf6..55fac22b 100644 --- a/lib/main/itoSessionManager.test.ts +++ b/lib/main/itoSessionManager.test.ts @@ -37,7 +37,8 @@ const mockItoStreamController = { }), ), setMode: mock(), - sendConfigUpdate: mock(() => Promise.resolve()), + getCurrentMode: mock(() => ItoMode.TRANSCRIBE), + scheduleConfigUpdate: mock(() => Promise.resolve()), getAudioDurationMs: mock(() => 1000), endInteraction: mock(), cancelTranscription: mock(), @@ -67,6 +68,29 @@ mock.module('./interactions/InteractionManager', () => ({ })) const mockContextGrabber = { + gatherContext: mock(() => + Promise.resolve({ + windowTitle: 'Test Window', + appName: 'Test App', + contextText: 'Test context', + vocabularyWords: ['test', 'word'], + advancedSettings: { + llm: { + asrModel: 'whisper-1', + asrProvider: 'openai', + asrPrompt: '', + noSpeechThreshold: 0.5, + llmProvider: 'openai', + llmModel: 'gpt-4', + llmTemperature: 0.7, + transcriptionPrompt: '', + editingPrompt: '', + }, + grammarServiceEnabled: false, + macosAccessibilityContextEnabled: true, + }, + }), + ), getCursorContextForGrammar: mock(() => Promise.resolve('test context')), } mock.module('./context/ContextGrabber', () => ({ @@ -164,7 +188,7 @@ describe('itoSessionManager', () => { // Wait for background context fetch await new Promise(resolve => setTimeout(resolve, 50)) - expect(mockItoStreamController.sendConfigUpdate).toHaveBeenCalled() + expect(mockItoStreamController.scheduleConfigUpdate).toHaveBeenCalled() }) test('should fetch cursor context when grammar is enabled', async () => { @@ -405,7 +429,7 @@ describe('itoSessionManager', () => { }) test('should handle context fetch error gracefully', async () => { - mockItoStreamController.sendConfigUpdate.mockRejectedValueOnce( + mockItoStreamController.scheduleConfigUpdate.mockRejectedValueOnce( new Error('Context fetch failed'), ) diff --git a/lib/main/itoSessionManager.ts b/lib/main/itoSessionManager.ts index 6c44b200..b8530c4a 100644 --- a/lib/main/itoSessionManager.ts +++ b/lib/main/itoSessionManager.ts @@ -67,8 +67,15 @@ export class ItoSessionManager { } private async fetchAndSendContext() { - // This builds the full config (window context, selected text, vocabulary, settings) - await itoStreamController.sendConfigUpdate() + console.log('[itoSessionManager] Gathering context...') + + // Gather all context data (window, app, selected text, vocabulary, settings) + const context = await contextGrabber.gatherContext( + itoStreamController.getCurrentMode(), + ) + + // Send the gathered context to the stream controller + await itoStreamController.scheduleConfigUpdate(context) // Fetch cursor context for grammar rules only if grammar service is enabled const { grammarServiceEnabled } = getAdvancedSettings() diff --git a/lib/main/itoStreamController.test.ts b/lib/main/itoStreamController.test.ts index a100b6bb..c40410f6 100644 --- a/lib/main/itoStreamController.test.ts +++ b/lib/main/itoStreamController.test.ts @@ -62,6 +62,8 @@ const mockContextGrabber = { transcriptionPrompt: '', editingPrompt: '', }, + grammarServiceEnabled: false, + macosAccessibilityContextEnabled: true, }, }), ), @@ -189,11 +191,10 @@ describe('ItoStreamController', () => { await controller.initialize(ItoMode.TRANSCRIBE) mockAudioStreamManager.isCurrentlyStreaming.mockReturnValue(true) - await controller.sendConfigUpdate() + const mockContext = await mockContextGrabber.gatherContext() + await controller.scheduleConfigUpdate(mockContext) - expect(mockContextGrabber.gatherContext).toHaveBeenCalledWith( - ItoMode.TRANSCRIBE, - ) + expect(mockContextGrabber.gatherContext).toHaveBeenCalled() }) test('should warn when sending config without active stream', async () => { @@ -202,9 +203,11 @@ describe('ItoStreamController', () => { mockAudioStreamManager.isCurrentlyStreaming.mockReturnValue(false) - await controller.sendConfigUpdate() + const mockContext = await mockContextGrabber.gatherContext() + await controller.scheduleConfigUpdate(mockContext) - expect(mockContextGrabber.gatherContext).not.toHaveBeenCalled() + // Should not be called again since we already called it to get mockContext + expect(mockContextGrabber.gatherContext).toHaveBeenCalledTimes(1) }) test('should end interaction successfully', async () => { diff --git a/lib/main/itoStreamController.ts b/lib/main/itoStreamController.ts index c2851c47..5279daaf 100644 --- a/lib/main/itoStreamController.ts +++ b/lib/main/itoStreamController.ts @@ -9,7 +9,7 @@ import { import { create } from '@bufbuild/protobuf' import { grpcClient } from '../clients/grpcClient' import { AudioStreamManager } from './audio/AudioStreamManager' -import { contextGrabber } from './context/ContextGrabber' +import { ContextData } from './context/ContextGrabber' import log from 'electron-log' import { timingCollector, TimingEventName } from './timing/TimingCollector' import { interactionManager } from './interactions/InteractionManager' @@ -85,6 +85,10 @@ export class ItoStreamController { } } + public getCurrentMode(): ItoMode { + return this.currentMode + } + public setMode(mode: ItoMode) { if (!this.audioStreamManager.isCurrentlyStreaming()) { log.warn('[ItoStreamController] Cannot change mode - no active stream') @@ -98,14 +102,14 @@ export class ItoStreamController { this.sendModeUpdate(mode) } - public async sendConfigUpdate() { + public scheduleConfigUpdate(context: ContextData) { if (!this.audioStreamManager.isCurrentlyStreaming()) { log.warn('[ItoStreamController] Cannot send config - no active stream') return } console.log('[ItoStreamController] Queueing config update') - const config = await this.buildStreamConfig() + const config = this.buildStreamConfig(context) this.configQueue.push(config) } @@ -205,11 +209,9 @@ export class ItoStreamController { } } - private async buildStreamConfig(): Promise { - // Gather all config data using ContextGrabber - const context = await contextGrabber.gatherContext(this.currentMode) + private buildStreamConfig(context: ContextData): TranscribeStreamRequest { const interactionId = interactionManager.getCurrentInteractionId() - + // Build gRPC config message from the provided context data return create(TranscribeStreamRequestSchema, { payload: { case: 'config', diff --git a/lib/main/main.ts b/lib/main/main.ts index e736b768..12bf95aa 100644 --- a/lib/main/main.ts +++ b/lib/main/main.ts @@ -23,6 +23,7 @@ import { checkAccessibilityPermission } from '../utils/crossPlatform' import mainStore, { initializeStore } from './store' import { STORE_KEYS } from '../constants/store-keys' import { selectedTextReaderService } from '../media/selected-text-reader' +import { macOSAccessibilityContextProvider } from '../media/macOSAccessibilityContextProvider' import { voiceInputService } from './voiceInputService' import { initializeMicrophoneSelection } from '../media/microphoneSetUp' import { validateStoredTokens, ensureValidTokens } from '../auth/events' @@ -131,6 +132,12 @@ app.whenReady().then(async () => { console.log('Starting selected text reader service.') selectedTextReaderService.initialize() + // Initialize cursor context provider (macOS only for now) + if (process.platform === 'darwin') { + console.log('Starting cursor context provider.') + macOSAccessibilityContextProvider.initialize() + } + // Initialize microphone selection to prefer built-in microphone await initializeMicrophoneSelection() diff --git a/lib/main/store.ts b/lib/main/store.ts index f84b185c..158dca0d 100644 --- a/lib/main/store.ts +++ b/lib/main/store.ts @@ -70,6 +70,7 @@ export interface AuthStore { export interface AdvancedSettings { llm: LlmSettings grammarServiceEnabled: boolean + macosAccessibilityContextEnabled: boolean } interface AppStore { @@ -142,6 +143,7 @@ export const defaultValues: AppStore = { auth: { user: null, tokens: null, state: createNewAuthState() }, advancedSettings: { grammarServiceEnabled: false, + macosAccessibilityContextEnabled: false, llm: { asrProvider: DEFAULT_ADVANCED_SETTINGS.asrProvider, asrModel: DEFAULT_ADVANCED_SETTINGS.asrModel, diff --git a/lib/main/timing/TimingCollector.ts b/lib/main/timing/TimingCollector.ts index 0bf3aeab..673019fd 100644 --- a/lib/main/timing/TimingCollector.ts +++ b/lib/main/timing/TimingCollector.ts @@ -22,6 +22,7 @@ export enum TimingEventName { SELCTED_TEXT_GATHER = 'selected_text_gather', WINDOW_CONTEXT_GATHER = 'window_context_gather', GRAMMAR_SERVICE = 'grammar_service', + CURSOR_CONTEXT_GATHER = 'cursor_context_gather', // Output TEXT_WRITER = 'text_writer', diff --git a/lib/media/IAccessibilityContextProvider.ts b/lib/media/IAccessibilityContextProvider.ts new file mode 100644 index 00000000..1efabbbf --- /dev/null +++ b/lib/media/IAccessibilityContextProvider.ts @@ -0,0 +1,14 @@ +import type { + CursorContextOptions, + CursorContextResult, +} from '../types/cursorContext' + +export interface IAccessibilityContextProvider { + initialize(): void + + shutdown(): void + + isRunning(): boolean + + getCursorContext(options?: CursorContextOptions): Promise +} diff --git a/lib/media/macOSAccessibilityContextProvider.ts b/lib/media/macOSAccessibilityContextProvider.ts new file mode 100644 index 00000000..800033fe --- /dev/null +++ b/lib/media/macOSAccessibilityContextProvider.ts @@ -0,0 +1,113 @@ +/** + * macOS Accessibility Context Provider Implementation + * + * Uses a one-shot Swift binary that retrieves cursor context + * using macOS NSAccessibility/AXUIElement APIs. + */ + +import { execFile } from 'child_process' +import { platform, arch } from 'os' +import { getNativeBinaryPath } from './native-interface' +import log from 'electron-log' +import type { IAccessibilityContextProvider } from './IAccessibilityContextProvider' +import type { + CursorContextOptions, + CursorContextResult, +} from '../types/cursorContext' + +const NATIVE_MODULE_NAME = 'cursor-context' +export class MacOSAccessibilityContextProvider + implements IAccessibilityContextProvider +{ + #binaryPath: string | null = null + + constructor() {} + + public initialize(): void { + const binaryPath = getNativeBinaryPath(NATIVE_MODULE_NAME) + if (!binaryPath) { + const error = new Error( + `Cannot determine ${NATIVE_MODULE_NAME} binary path for platform ${platform()} and arch ${arch()}`, + ) + log.error('[MacOSAccessibilityContextProvider]', error.message) + throw error + } + + this.#binaryPath = binaryPath + console.log( + `[MacOSAccessibilityContextProvider] Initialized with binary path: ${binaryPath}`, + ) + } + + public shutdown(): void { + // No-op for one-shot process + } + + public isRunning(): boolean { + return this.#binaryPath !== null + } + + public async getCursorContext( + options: CursorContextOptions, + ): Promise { + if (!this.#binaryPath) { + throw new Error('Provider not initialized. Call initialize() first.') + } + + return new Promise((resolve, reject) => { + const args = [ + '--before', + String(options.maxCharsBefore), + '--after', + String(options.maxCharsAfter), + ] + + // Enable debug logging if requested + if (options.debug) { + args.push('--debug') + } + + execFile( + this.#binaryPath!, + args, + { timeout: options.timeout }, + (error, stdout, stderr) => { + if (error) { + log.error( + '[MacOSAccessibilityContextProvider] execFile error:', + error, + ) + reject(error) + return + } + + if (stderr) { + console.log( + '[MacOSAccessibilityContextProvider] stderr:', + stderr.trim(), + ) + } + + try { + const result: CursorContextResult = JSON.parse(stdout.trim()) + console.log( + '[MacOSAccessibilityContextProvider] Retrieved cursor context:', + result, + ) + resolve(result) + } catch (parseError) { + log.error( + '[MacOSAccessibilityContextProvider] Failed to parse JSON:', + parseError, + ) + reject(new Error('Failed to parse response from native binary')) + } + }, + ) + }) + } +} + +// Export singleton instance +export const macOSAccessibilityContextProvider = + new MacOSAccessibilityContextProvider() diff --git a/lib/types/cursorContext.ts b/lib/types/cursorContext.ts new file mode 100644 index 00000000..b99f29e8 --- /dev/null +++ b/lib/types/cursorContext.ts @@ -0,0 +1,88 @@ +/** + * Cursor Context Types + * + * Defines types for retrieving text surrounding the cursor position + * using accessibility APIs (NSAccessibility on macOS, UIAutomation on Windows) + */ + +/** + * Position of the cursor within a text field + */ +export interface CursorPosition { + /** Character offset from start of text */ + offset: number + /** Line number (0-indexed) */ + line?: number + /** Column number (0-indexed) */ + column?: number +} + +/** + * Range of text within a text field + */ +export interface TextRange { + /** Start position (character offset) */ + start: number + /** End position (character offset) */ + end: number + /** Length of the range */ + length: number +} + +/** + * Text content surrounding the cursor with metadata + */ +export interface CursorContext { + /** Text before the cursor */ + textBefore: string + /** Text after the cursor */ + textAfter: string + /** Currently selected/highlighted text, if any */ + selectedText: string + /** Current cursor position */ + cursorPosition: CursorPosition + /** Selection range, if text is selected */ + selectionRange?: TextRange + /** Whether the text was truncated due to length limits */ + truncated: boolean + /** Total character count in the text field */ + totalLength: number + /** Timestamp when context was captured */ + timestamp: string +} + +/** + * Complete cursor context result including success/error status + */ +export interface CursorContextResult { + success: boolean + context?: CursorContext + error?: string + /** Method used to retrieve context (for debugging/telemetry) */ + method: 'accessibility' | 'ocr' | 'clipboard' | 'keyboard' +} + +/** + * Options for retrieving cursor context + */ +export interface CursorContextOptions { + /** + * Maximum characters to retrieve before cursor + */ + maxCharsBefore: number + + /** + * Maximum characters to retrieve after cursor + */ + maxCharsAfter: number + + /** + * Timeout in milliseconds + */ + timeout: number + + /** + * Enable debug logging to stderr + */ + debug: boolean +} diff --git a/lib/utils/applicationDetection.ts b/lib/utils/applicationDetection.ts index 2cdf1a4b..79e681fc 100644 --- a/lib/utils/applicationDetection.ts +++ b/lib/utils/applicationDetection.ts @@ -65,6 +65,30 @@ const TERMINAL_APPS = new Set([ 'yakuake', ]) +const AXApiNotSupportedApps = new Set([ + 'visual studio code', + 'visual studio code - insiders', + 'code', + 'code - insiders', + 'visual studio', + 'visual studio 2022', + 'visual studio 2019', +]) + +export async function canGetContextWithAccessibilityApis(): Promise { + try { + const window = await getActiveWindow() + if (!window?.appName) { + return false // Default to disallowing context if we can't determine + } + const lowerAppName = window.appName.toLowerCase() + return !AXApiNotSupportedApps.has(lowerAppName) + } catch (error) { + console.error('Failed to get active window:', error) + return false // Default to not allowing context on error + } +} + export function isTerminalApplication(appName: string): boolean { const lowerAppName = appName.toLowerCase() return TERMINAL_APPS.has(lowerAppName) diff --git a/native/cursor-context/Package.resolved b/native/cursor-context/Package.resolved new file mode 100644 index 00000000..d38abd8e --- /dev/null +++ b/native/cursor-context/Package.resolved @@ -0,0 +1,14 @@ +{ + "pins" : [ + { + "identity" : "swift-argument-parser", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-argument-parser", + "state" : { + "revision" : "cdd0ef3755280949551dc26dee5de9ddeda89f54", + "version" : "1.6.2" + } + } + ], + "version" : 2 +} diff --git a/native/cursor-context/Package.swift b/native/cursor-context/Package.swift new file mode 100644 index 00000000..f2636811 --- /dev/null +++ b/native/cursor-context/Package.swift @@ -0,0 +1,21 @@ +// swift-tools-version:5.7 +import PackageDescription + +let package = Package( + name: "cursor-context", + platforms: [.macOS(.v11)], + dependencies: [ + .package(url: "https://github.com/apple/swift-argument-parser", from: "1.2.0"), + ], + targets: [ + .executableTarget( + name: "cursor-context", + dependencies: [ + .product(name: "ArgumentParser", package: "swift-argument-parser"), + ], + linkerSettings: [ + .linkedFramework("AppKit"), + ] + ) + ] +) diff --git a/native/cursor-context/Sources/cursor-context/CLI.swift b/native/cursor-context/Sources/cursor-context/CLI.swift new file mode 100644 index 00000000..2951ac89 --- /dev/null +++ b/native/cursor-context/Sources/cursor-context/CLI.swift @@ -0,0 +1,63 @@ +import ArgumentParser +import Foundation + +struct CursorContextCLI: ParsableCommand { + static let configuration = CommandConfiguration( + commandName: "cursor-context", + abstract: "Extract cursor context from the focused text element", + discussion: """ + This tool uses macOS Accessibility APIs to extract text around the cursor position + in the currently focused text field or editor. + """ + ) + + @Option(name: .long, help: "Maximum characters to capture before the cursor") + var before: Int = 1000 + + @Option(name: .long, help: "Maximum characters to capture after the cursor") + var after: Int = 1000 + + @Option(name: .long, help: "Delay in seconds before capturing context") + var delay: Int = 0 + + @Flag(name: .long, help: "Enable debug logging to stderr") + var debug: Bool = false + + mutating func run() throws { + // Set global debug flag + DEBUG_LOG = debug + + // Wait for specified delay if provided + if delay > 0 { + if debug { + fputs("Waiting \(delay) second(s) before capturing...\n", stderr) + } + Thread.sleep(forTimeInterval: TimeInterval(delay)) + } + + // Get cursor context with specified parameters + let result = getCursorContext(maxCharsBefore: before, maxCharsAfter: after) + + // Encode and print JSON result + let encoder = JSONEncoder() + encoder.outputFormatting = [.withoutEscapingSlashes] + + guard let jsonData = try? encoder.encode(result), + let jsonString = String(data: jsonData, encoding: .utf8) else { + throw CLIError.encodingFailed + } + + print(jsonString) + } +} + +enum CLIError: Error, CustomStringConvertible { + case encodingFailed + + var description: String { + switch self { + case .encodingFailed: + return "Failed to encode result as JSON" + } + } +} diff --git a/native/cursor-context/Sources/cursor-context/main.swift b/native/cursor-context/Sources/cursor-context/main.swift new file mode 100644 index 00000000..9aaba997 --- /dev/null +++ b/native/cursor-context/Sources/cursor-context/main.swift @@ -0,0 +1,692 @@ +import AppKit +import Foundation +import ApplicationServices + +// MARK: - Missing AX constants (define as CFString if not exported by SDK) +private let kAXStringForTextMarkerRangeParameterizedAttribute: CFString = "AXStringForTextMarkerRange" as CFString +private let kAXLengthForTextMarkerRangeParameterizedAttribute: CFString = "AXLengthForTextMarkerRange" as CFString +private let kAXTextMarkerRangeForUIElementParameterizedAttribute: CFString = "AXTextMarkerRangeForUIElement" as CFString +private let kAXSelectedTextMarkerRangeAttribute: CFString = "AXSelectedTextMarkerRange" as CFString +private let kAXDocumentRangeAttribute: CFString = "AXDocumentRange" as CFString + +// Some SDK constants surface as String; cast to CFString at use-sites. +@inline(__always) func CFs(_ s: String) -> CFString { s as CFString } + +// MARK: - Data Structures + +struct CursorPosition: Codable { let offset: Int; let line: Int?; let column: Int? } +struct TextRange: Codable { let start: Int; let end: Int; let length: Int } + +struct CursorContext: Codable { + let textBefore: String + let textAfter: String + let selectedText: String + let cursorPosition: CursorPosition + let selectionRange: TextRange? + let truncated: Bool + let totalLength: Int + let timestamp: String +} + +struct CursorContextResult: Codable { + let success: Bool + let context: CursorContext? + let error: String? + let method: String + let durationMs: Int +} + +// MARK: - Utilities + +var DEBUG_LOG = false +@inline(__always) func dlog(_ s: String) { if DEBUG_LOG { fputs(s + "\n", stderr) } } + +@inline(__always) +func axErrorToString(_ e: AXError) -> String { + switch e { + case .success: return "Success" + case .failure: return "Failure" + case .illegalArgument: return "IllegalArgument" + case .invalidUIElement: return "InvalidUIElement" + case .invalidUIElementObserver: return "InvalidUIElementObserver" + case .cannotComplete: return "CannotComplete" + case .attributeUnsupported: return "AttributeUnsupported" + case .actionUnsupported: return "ActionUnsupported" + case .notificationUnsupported: return "NotificationUnsupported" + case .notImplemented: return "NotImplemented" + case .notificationAlreadyRegistered: return "NotificationAlreadyRegistered" + case .notificationNotRegistered: return "NotificationNotRegistered" + case .apiDisabled: return "APIDisabled" + case .noValue: return "NoValue" + case .parameterizedAttributeUnsupported: return "ParameterizedAttributeUnsupported" + case .notEnoughPrecision: return "NotEnoughPrecision" + @unknown default: return "Unknown(\(e.rawValue))" + } +} + +// Use CFTypeRef? so we control casts explicitly. +@inline(__always) +func axCopyAttr(_ element: AXUIElement, _ name: CFString) -> CFTypeRef? { + var v: CFTypeRef? + let r = AXUIElementCopyAttributeValue(element, name, &v) + return r == .success ? v : nil +} + +@inline(__always) +func axCopyParam(_ element: AXUIElement, _ name: CFString, _ param: CFTypeRef) -> CFTypeRef? { + var v: CFTypeRef? + let r = AXUIElementCopyParameterizedAttributeValue(element, name, param, &v) + return r == .success ? v : nil +} + +@inline(__always) +func axStringForRange(_ element: AXUIElement, location: Int, length: Int) -> String? { + var cfRange = CFRange(location: location, length: length) + guard let axRange = AXValueCreate(.cfRange, &cfRange) else { return nil } + return axCopyParam(element, CFs(kAXStringForRangeParameterizedAttribute as String), axRange) as? String +} + +@inline(__always) +func axStringForMarkerRange(_ element: AXUIElement, _ mr: CFTypeRef) -> String? { + axCopyParam(element, kAXStringForTextMarkerRangeParameterizedAttribute, mr) as? String +} + +@inline(__always) +func axLengthForMarkerRange(_ element: AXUIElement, _ mr: CFTypeRef) -> Int? { + (axCopyParam(element, kAXLengthForTextMarkerRangeParameterizedAttribute, mr) as? NSNumber)?.intValue +} + +// Document marker range: attribute or parameterized +func documentMarkerRange(_ element: AXUIElement) -> CFTypeRef? { + if let v = axCopyAttr(element, kAXDocumentRangeAttribute) { return v } + if let v = axCopyParam(element, kAXTextMarkerRangeForUIElementParameterizedAttribute, element) { return v } + return nil +} + +func selectedMarkerRange(_ element: AXUIElement) -> CFTypeRef? { + axCopyAttr(element, kAXSelectedTextMarkerRangeAttribute) +} + +// MARK: - Diagnostics + +// Inspect element in detail - shows ALL attributes +func inspectElement(_ element: AXUIElement, label: String) { + guard DEBUG_LOG else { return } + + fputs("\n>>> INSPECTING: \(label)\n", stderr) + + // Get role + if let role = axCopyAttr(element, CFs(kAXRoleAttribute as String)) as? String { + fputs(" Role: \(role)\n", stderr) + } + + // Get all available attribute names + var attrNames: CFArray? + if AXUIElementCopyAttributeNames(element, &attrNames) == .success, let names = attrNames as? [String] { + fputs(" Available attributes (\(names.count)):\n", stderr) + for name in names.prefix(20) { // Limit to first 20 to avoid spam + fputs(" - \(name)\n", stderr) + } + if names.count > 20 { + fputs(" ... (\(names.count - 20) more)\n", stderr) + } + + // Check interesting attributes for debugging + let interestingAttrs = ["AXDescription", "AXTitle", "AXHelp", "AXPlaceholderValue", "ChromeAXNodeId", "AXDOMIdentifier", "AXDOMClassList"] + fputs(" Checking interesting attributes:\n", stderr) + for attr in interestingAttrs { + if let value = axCopyAttr(element, attr as CFString) { + if let str = value as? String, !str.isEmpty { + fputs(" \(attr) = \"\(str)\"\n", stderr) + } else if let num = value as? NSNumber { + fputs(" \(attr) = \(num)\n", stderr) + } + } + } + } + + // Check for children + if let childrenAny = axCopyAttr(element, CFs(kAXChildrenAttribute as String)), + let children = childrenAny as? [AXUIElement] { + fputs(" Children: \(children.count)\n", stderr) + if children.count > 0 && children.count <= 5 { + for (i, child) in children.enumerated() { + if let role = axCopyAttr(child, CFs(kAXRoleAttribute as String)) as? String { + fputs(" [\(i)]: \(role)\n", stderr) + } + } + } else if children.count > 5 { + fputs(" (first 5 of \(children.count))\n", stderr) + for i in 0..<5 { + if let role = axCopyAttr(children[i], CFs(kAXRoleAttribute as String)) as? String { + fputs(" [\(i)]: \(role)\n", stderr) + } + } + } + } + + fputs("<<<\n\n", stderr) +} + +// Structured logging helpers +func logMethodStart(_ method: String) { + dlog("[\(method)] Attempting...") +} + +func logMethodSuccess(_ method: String, _ detail: String = "") { + let msg = detail.isEmpty ? "✓ Success" : "✓ \(detail)" + dlog("[\(method)] \(msg)") +} + +func logMethodFailure(_ method: String, _ reason: String) { + dlog("[\(method)] ✗ \(reason)") +} + +func logMethodSkip(_ method: String, _ reason: String) { + dlog("[\(method)] → Skipping: \(reason)") +} + +// MARK: - Extraction Paths + +// Path A: Classic Cocoa (kAXValue + kAXSelectedTextRange) +func valueBasedContext(_ element: AXUIElement, maxBefore: Int, maxAfter: Int, startTime: Date) -> CursorContextResult? { + logMethodStart("VALUE_METHOD") + + // Try to get AXValue + guard let fullText = axCopyAttr(element, CFs(kAXValueAttribute as String)) as? String else { + logMethodFailure("VALUE_METHOD", "AXValue attribute not available or not a string") + logMethodSkip("VALUE_METHOD", "no value attribute") + return nil + } + + guard !fullText.isEmpty && fullText.count > 0 else { + logMethodFailure("VALUE_METHOD", "AXValue is empty (length: \(fullText.count))") + logMethodSkip("VALUE_METHOD", "value is empty") + return nil + } + + logMethodSuccess("VALUE_METHOD", "Got AXValue with \(fullText.count) characters") + + let selectedText = (axCopyAttr(element, CFs(kAXSelectedTextAttribute as String)) as? String) ?? "" + var cursorOffset = 0 + var selection: TextRange? = nil + + if let any = axCopyAttr(element, CFs(kAXSelectedTextRangeAttribute as String)) { + let axv = any as! AXValue // CFTypeRef → AXValue (CoreFoundation type; explicit cast) + var cfRange = CFRange(location: 0, length: 0) + if AXValueGetValue(axv, .cfRange, &cfRange) { + cursorOffset = cfRange.location + if cfRange.length > 0 { + selection = TextRange(start: cfRange.location, + end: cfRange.location + cfRange.length, + length: cfRange.length) + } + logMethodSuccess("VALUE_METHOD", "Got cursor position at offset \(cursorOffset)") + } + } else { + dlog("[VALUE_METHOD] No AXSelectedTextRange (cursor will default to 0)") + } + + let totalLength = fullText.count + let startOffset = max(0, cursorOffset - maxBefore) + let endOffset = min(totalLength, cursorOffset + (selection?.length ?? 0) + maxAfter) + + let startIndex = fullText.index(fullText.startIndex, offsetBy: startOffset) + let cursorIndex = fullText.index(fullText.startIndex, offsetBy: cursorOffset) + let endIndex = fullText.index(fullText.startIndex, offsetBy: endOffset) + + let textBefore = String(fullText[startIndex.. 0) || (endOffset < totalLength) + + logMethodSuccess("VALUE_METHOD", "Returning context (before: \(textBefore.count), after: \(textAfter.count))") + + let context = CursorContext( + textBefore: textBefore, + textAfter: textAfter, + selectedText: selectedText, + cursorPosition: CursorPosition(offset: cursorOffset, line: nil, column: nil), + selectionRange: selection, + truncated: truncated, + totalLength: totalLength, + timestamp: ISO8601DateFormatter().string(from: Date()) + ) + let elapsed = Int(Date().timeIntervalSince(startTime) * 1000) + return CursorContextResult(success: true, context: context, error: nil, method: "accessibility:value", durationMs: elapsed) +} + +// Path B: Marker-based (Chromium/WebKit/Electron). +struct MarkerContext { + let fullText: String + let before: String + let after: String + let selected: String + let cursorOffset: Int + let selectionLength: Int + let totalLength: Int +} + +func markerBasedContext(_ element: AXUIElement, maxBefore: Int, maxAfter: Int) -> MarkerContext? { + // Try to get document marker range + guard let docMR_any = documentMarkerRange(element) else { + logMethodFailure("MARKER_METHOD", "No document marker range available") + return nil + } + logMethodSuccess("MARKER_METHOD", "Got document marker range") + + // Try to get full text from document range + guard let fullText = axStringForMarkerRange(element, docMR_any) else { + logMethodFailure("MARKER_METHOD", "Could not extract text from document marker range") + return nil + } + + guard let docLen = axLengthForMarkerRange(element, docMR_any) else { + logMethodFailure("MARKER_METHOD", "Could not get length for document marker range") + return nil + } + + // Reject empty text - this means the API is available but not actually providing content + guard docLen > 0 && !fullText.isEmpty else { + logMethodFailure("MARKER_METHOD", "Document marker range returned empty text (length: \(docLen))") + return nil + } + + logMethodSuccess("MARKER_METHOD", "Got document text with \(docLen) characters") + + // Try to get selected marker range + guard let selMR_any = selectedMarkerRange(element) else { + logMethodFailure("MARKER_METHOD", "No selected marker range available") + return nil + } + logMethodSuccess("MARKER_METHOD", "Got selected marker range") + + // CFTypeRef → AXTextMarkerRange (explicit casts) + let docMR = docMR_any as! AXTextMarkerRange + let selMR = selMR_any as! AXTextMarkerRange + + // Extract start markers + let docStartMarker: AXTextMarker = AXTextMarkerRangeCopyStartMarker(docMR) + let selStartMarker: AXTextMarker = AXTextMarkerRangeCopyStartMarker(selMR) + // (selEnd available if needed) + _ = AXTextMarkerRangeCopyEndMarker(selMR) + + // Build [docStart, selStart) to measure caret offset (returns non-optional) + let startToSelStart: AXTextMarkerRange = AXTextMarkerRangeCreate(kCFAllocatorDefault, docStartMarker, selStartMarker) + let cursorOffset = axLengthForMarkerRange(element, startToSelStart) ?? 0 + + let selectedText = axStringForMarkerRange(element, selMR) ?? "" + let selLen = axLengthForMarkerRange(element, selMR) ?? 0 + + logMethodSuccess("MARKER_METHOD", "Calculated cursor offset: \(cursorOffset), selection length: \(selLen)") + + // Window around caret in Swift + let safeCursor = max(0, min(cursorOffset, docLen)) + let afterEnd = min(docLen, safeCursor + selLen + maxAfter) + let beforeStart = max(0, safeCursor - maxBefore) + + let startIdx = fullText.startIndex + let beforeStartIdx = fullText.index(startIdx, offsetBy: beforeStart) + let cursorIdx = fullText.index(startIdx, offsetBy: safeCursor) + let afterEndIdx = fullText.index(startIdx, offsetBy: afterEnd) + + let before = String(fullText[beforeStartIdx.. CursorContextResult? { + let bigLen = 5_000_000 + + guard let fullText = axStringForRange(element, location: 0, length: bigLen) else { + logMethodFailure("RANGE_METHOD", "AXStringForRange not supported or failed") + return nil + } + + guard !fullText.isEmpty && fullText.count > 0 else { + logMethodFailure("RANGE_METHOD", "AXStringForRange returned empty text (length: \(fullText.count))") + return nil + } + + logMethodSuccess("RANGE_METHOD", "Got text via AXStringForRange (\(fullText.count) characters)") + + var cursorOffset = 0 + var selLen = 0 + if let any = axCopyAttr(element, CFs(kAXSelectedTextRangeAttribute as String)) { + let axv = any as! AXValue + var cfRange = CFRange(location: 0, length: 0) + if AXValueGetValue(axv, .cfRange, &cfRange) { + cursorOffset = cfRange.location + selLen = cfRange.length + logMethodSuccess("RANGE_METHOD", "Got cursor position at offset \(cursorOffset)") + } + } else { + dlog("[RANGE_METHOD] No AXSelectedTextRange (cursor will default to 0)") + } + + let totalLength = fullText.count + let startOffset = max(0, min(cursorOffset, totalLength) - maxBefore) + let endOffset = min(totalLength, cursorOffset + selLen + maxAfter) + + let startIndex = fullText.index(fullText.startIndex, offsetBy: startOffset) + let cursorIndex = fullText.index(fullText.startIndex, offsetBy: min(cursorOffset, totalLength)) + let endIndex = fullText.index(fullText.startIndex, offsetBy: endOffset) + + let textBefore = String(fullText[startIndex.. 0) || (endOffset < totalLength) + + let selectedText: String = selLen > 0 ? (axStringForRange(element, location: cursorOffset, length: selLen) ?? "") : "" + + let selectionRange: TextRange? = selLen > 0 + ? TextRange(start: cursorOffset, end: cursorOffset + selLen, length: selLen) + : nil + + logMethodSuccess("RANGE_METHOD", "Returning context (before: \(textBefore.count), after: \(textAfter.count))") + + let context = CursorContext( + textBefore: textBefore, + textAfter: textAfter, + selectedText: selectedText, + cursorPosition: CursorPosition(offset: min(cursorOffset, totalLength), line: nil, column: nil), + selectionRange: selectionRange, + truncated: truncated, + totalLength: totalLength, + timestamp: ISO8601DateFormatter().string(from: Date()) + ) + let elapsed = Int(Date().timeIntervalSince(startTime) * 1000) + return CursorContextResult(success: true, context: context, error: nil, method: "accessibility:range", durationMs: elapsed) +} + +// Try focused element, then a single parent hop +func bestRangeContextWithParentHop(_ element: AXUIElement, maxBefore: Int, maxAfter: Int, startTime: Date) -> CursorContextResult? { + logMethodStart("RANGE_METHOD") + dlog("[RANGE_METHOD] Trying on focused element...") + + if let r = rangeBasedContext(element, maxBefore: maxBefore, maxAfter: maxAfter, startTime: startTime) { + logMethodSuccess("RANGE_METHOD", "Succeeded on focused element") + return r + } + + // Check if focused element is a text role - if so, skip parent to avoid UI chrome + let textRoles = ["AXTextArea", "AXTextField", "AXWebArea"] + if let role = axCopyAttr(element, CFs(kAXRoleAttribute as String)) as? String { + if textRoles.contains(role) { + dlog("[RANGE_METHOD] Focused element is \(role) but has no text - skipping parent to avoid UI chrome") + logMethodSkip("RANGE_METHOD", "text role element with no content (will try tree traversal)") + return nil + } + } + + dlog("[RANGE_METHOD] Failed on focused element, trying parent...") + if let parentAny = axCopyAttr(element, CFs(kAXParentAttribute as String)) { + let parent = parentAny as! AXUIElement + if let r = rangeBasedContext(parent, maxBefore: maxBefore, maxAfter: maxAfter, startTime: startTime) { + logMethodSuccess("RANGE_METHOD", "Succeeded on parent element") + return r + } + dlog("[RANGE_METHOD] Failed on parent element too") + } else { + dlog("[RANGE_METHOD] No parent element available") + } + + logMethodSkip("RANGE_METHOD", "range API not available on element or parent") + return nil +} + +func bestMarkerContextWithParentHop(_ element: AXUIElement, maxBefore: Int, maxAfter: Int) -> (MarkerContext, String)? { + logMethodStart("MARKER_METHOD") + dlog("[MARKER_METHOD] Trying on focused element...") + + if let m = markerBasedContext(element, maxBefore: maxBefore, maxAfter: maxAfter) { + logMethodSuccess("MARKER_METHOD", "Succeeded on focused element") + return (m, "accessibility:marker") + } + + // Check if focused element is a text role (e.g., AXTextArea, AXTextField) + // If so, don't try parent - it likely contains UI chrome + let textRoles = ["AXTextArea", "AXTextField", "AXWebArea"] + if let role = axCopyAttr(element, CFs(kAXRoleAttribute as String)) as? String { + if textRoles.contains(role) { + dlog("[MARKER_METHOD] Focused element is \(role) but has no text - skipping parent to avoid UI chrome") + logMethodSkip("MARKER_METHOD", "text role element with no content (will try tree traversal)") + return nil + } + } + + dlog("[MARKER_METHOD] Failed on focused element, trying parent...") + if let parentAny = axCopyAttr(element, CFs(kAXParentAttribute as String)) { + let parent = parentAny as! AXUIElement + if let m = markerBasedContext(parent, maxBefore: maxBefore, maxAfter: maxAfter) { + logMethodSuccess("MARKER_METHOD", "Succeeded on parent element") + return (m, "accessibility:marker(parent)") + } + dlog("[MARKER_METHOD] Failed on parent element too") + } else { + dlog("[MARKER_METHOD] No parent element available") + } + + logMethodSkip("MARKER_METHOD", "marker API not available on element or parent") + return nil +} + +// MARK: - Path D: Electron Tree Traversal + +// Recursively search for text-capable elements in the accessibility tree +func findTextElements(_ element: AXUIElement, maxDepth: Int, currentDepth: Int = 0) -> [AXUIElement] { + var results: [AXUIElement] = [] + + if currentDepth >= maxDepth { + return results + } + + // Check if current element might have text content + let textRoles = ["AXTextArea", "AXTextField", "AXWebArea", "AXGroup", "AXScrollArea"] + if let role = axCopyAttr(element, CFs(kAXRoleAttribute as String)) as? String { + if textRoles.contains(role) { + // Check if it has any text-related attributes + let hasValue = axCopyAttr(element, CFs(kAXValueAttribute as String)) != nil + let hasMarkerRange = documentMarkerRange(element) != nil + let hasNumberOfChars = axCopyAttr(element, CFs(kAXNumberOfCharactersAttribute as String)) != nil + + if hasValue || hasMarkerRange || hasNumberOfChars { + results.append(element) + dlog("[TREE_TRAVERSAL] Found potential text element: \(role)") + } + } + } + + // Search children + if let childrenAny = axCopyAttr(element, CFs(kAXChildrenAttribute as String)), + let children = childrenAny as? [AXUIElement] { + for child in children { + results.append(contentsOf: findTextElements(child, maxDepth: maxDepth, currentDepth: currentDepth + 1)) + } + } + + return results +} + +// Try extraction methods on a candidate element +func tryExtractFromElement(_ element: AXUIElement, maxBefore: Int, maxAfter: Int, startTime: Date) -> CursorContextResult? { + // Try value-based first + if let result = valueBasedContext(element, maxBefore: maxBefore, maxAfter: maxAfter, startTime: startTime) { + return result + } + + // Try marker-based (without parent hop, we're already traversing) + if let mc = markerBasedContext(element, maxBefore: maxBefore, maxAfter: maxAfter) { + let truncated = (mc.cursorOffset > maxBefore) || (mc.cursorOffset + mc.selectionLength + maxAfter < mc.totalLength) + let selRange: TextRange? = mc.selectionLength > 0 + ? TextRange(start: mc.cursorOffset, end: mc.cursorOffset + mc.selectionLength, length: mc.selectionLength) + : nil + + let context = CursorContext( + textBefore: mc.before, + textAfter: mc.after, + selectedText: mc.selected, + cursorPosition: CursorPosition(offset: mc.cursorOffset, line: nil, column: nil), + selectionRange: selRange, + truncated: truncated, + totalLength: mc.totalLength, + timestamp: ISO8601DateFormatter().string(from: Date()) + ) + let elapsed = Int(Date().timeIntervalSince(startTime) * 1000) + return CursorContextResult(success: true, context: context, error: nil, method: "accessibility:marker(tree)", durationMs: elapsed) + } + + // Try range-based + if let result = rangeBasedContext(element, maxBefore: maxBefore, maxAfter: maxAfter, startTime: startTime) { + return result + } + + return nil +} + +// Path D: Electron/Chromium tree traversal fallback +func electronTreeTraversal(_ focusedElement: AXUIElement, maxBefore: Int, maxAfter: Int, startTime: Date) -> CursorContextResult? { + logMethodStart("TREE_TRAVERSAL") + dlog("[TREE_TRAVERSAL] Searching descendants for text elements (Electron fallback)...") + + inspectElement(focusedElement, label: "Focused Element (Tree Traversal)") + + // Search children up to 4 levels deep + let candidates = findTextElements(focusedElement, maxDepth: 4) + dlog("[TREE_TRAVERSAL] Found \(candidates.count) candidate text element(s)") + + // Try extraction on each candidate + for (index, candidate) in candidates.enumerated() { + if let role = axCopyAttr(candidate, CFs(kAXRoleAttribute as String)) as? String { + dlog("[TREE_TRAVERSAL] Trying candidate \(index + 1)/\(candidates.count): \(role)") + } + + if let result = tryExtractFromElement(candidate, maxBefore: maxBefore, maxAfter: maxAfter, startTime: startTime) { + logMethodSuccess("TREE_TRAVERSAL", "Found text in descendant element \(index + 1)") + return result + } + } + + // Also try parent traversal (go up instead of down) + dlog("[TREE_TRAVERSAL] No success in descendants, trying ancestors...") + var currentElement = focusedElement + for level in 1...3 { + guard let parentAny = axCopyAttr(currentElement, CFs(kAXParentAttribute as String)) else { + dlog("[TREE_TRAVERSAL] No more ancestors at level \(level)") + break + } + + let parent = parentAny as! AXUIElement + if let role = axCopyAttr(parent, CFs(kAXRoleAttribute as String)) as? String { + dlog("[TREE_TRAVERSAL] Trying ancestor at level \(level): \(role)") + } + + if let result = tryExtractFromElement(parent, maxBefore: maxBefore, maxAfter: maxAfter, startTime: startTime) { + logMethodSuccess("TREE_TRAVERSAL", "Found text in ancestor at level \(level)") + return result + } + + currentElement = parent + } + + logMethodSkip("TREE_TRAVERSAL", "no text found in tree traversal") + return nil +} + +// MARK: - Core + +func getCursorContext(maxCharsBefore: Int, maxCharsAfter: Int) -> CursorContextResult { + let startTime = Date() + + guard let frontmostApp = NSWorkspace.shared.frontmostApplication else { + let elapsed = Int(Date().timeIntervalSince(startTime) * 1000) + return CursorContextResult(success: false, context: nil, error: "No frontmost application", method: "accessibility", durationMs: elapsed) + } + let pid = frontmostApp.processIdentifier + let appElement = AXUIElementCreateApplication(pid) + + // Help Chromium/Electron + AXUIElementSetAttributeValue(appElement, "AXEnhancedUserInterface" as CFString, kCFBooleanTrue) + AXUIElementSetAttributeValue(appElement, "AXManualAccessibility" as CFString, kCFBooleanTrue) + dlog("✅ Enabled AXEnhancedUserInterface and AXManualAccessibility") + dlog("🔍 Focused Application: \(frontmostApp.localizedName ?? "Unknown")") + + // Trigger lazy initialization of accessibility tree by reading the role attribute + // This ensures the accessibility hierarchy is built before we query for focused element + // See: https://stackoverflow.com/questions/77954521 + let _ = axCopyAttr(appElement, CFs(kAXRoleAttribute as String)) + dlog("🔧 Triggered accessibility tree initialization via kAXRoleAttribute") + + // Poll for focused element up to 5 times with 10ms delays + // Handles lazy tree building in Chromium/Electron apps + var focusedElementObj: CFTypeRef? + var r: AXError = .failure + + for attempt in 1...5 { + r = AXUIElementCopyAttributeValue(appElement, CFs(kAXFocusedUIElementAttribute as String), &focusedElementObj) + + if r == .success && focusedElementObj != nil { + dlog("✅ Got focused element on attempt \(attempt)") + break + } + + if attempt < 5 { + Thread.sleep(forTimeInterval: 0.01) // 10ms + dlog("⏱️ Attempt \(attempt) failed, waiting 10ms...") + } + } + + // If we got a focused element, try the standard extraction methods + if r == .success, let focusedAny = focusedElementObj { + let element = focusedAny as! AXUIElement + + // 1) Value-based + if let v = valueBasedContext(element, maxBefore: maxCharsBefore, maxAfter: maxCharsAfter, startTime: startTime) { return v } + + // 2) Marker-based (+ parent hop) + if let (mc, methodTag) = bestMarkerContextWithParentHop(element, maxBefore: maxCharsBefore, maxAfter: maxCharsAfter) { + let truncated = (mc.cursorOffset > maxCharsBefore) || (mc.cursorOffset + mc.selectionLength + maxCharsAfter < mc.totalLength) + let selRange: TextRange? = mc.selectionLength > 0 + ? TextRange(start: mc.cursorOffset, end: mc.cursorOffset + mc.selectionLength, length: mc.selectionLength) + : nil + + let context = CursorContext( + textBefore: mc.before, + textAfter: mc.after, + selectedText: mc.selected, + cursorPosition: CursorPosition(offset: mc.cursorOffset, line: nil, column: nil), + selectionRange: selRange, + truncated: truncated, + totalLength: mc.totalLength, + timestamp: ISO8601DateFormatter().string(from: Date()) + ) + let elapsed = Int(Date().timeIntervalSince(startTime) * 1000) + return CursorContextResult(success: true, context: context, error: nil, method: methodTag, durationMs: elapsed) + } + + // 3) Range-based fallback (+ parent hop) + if let r = bestRangeContextWithParentHop(element, maxBefore: maxCharsBefore, maxAfter: maxCharsAfter, startTime: startTime) { return r } + + // 4) Electron tree traversal (comprehensive fallback) + dlog("\n⚠️ Standard methods failed, trying Electron tree traversal...") + if let r = electronTreeTraversal(element, maxBefore: maxCharsBefore, maxAfter: maxCharsAfter, startTime: startTime) { return r } + } else { + let errorDetail = "AXError: \(axErrorToString(r)), element nil: \(focusedElementObj == nil)" + dlog("❌ Failed to get focused element after 5 attempts: \(errorDetail)") + } + + let elapsed = Int(Date().timeIntervalSince(startTime) * 1000) + return CursorContextResult(success: false, context: nil, error: "Unable to retrieve text via Value, Marker, Range, or Tree Traversal", method: "accessibility", durationMs: elapsed) +} + +// MARK: - CLI Entry Point + +CursorContextCLI.main() diff --git a/package.json b/package.json index 102b161c..bae3661a 100644 --- a/package.json +++ b/package.json @@ -46,6 +46,7 @@ "build:rust:mac": "bash ./build-binaries.sh --mac", "build:rust:mac:x64": "bash ./build-binaries.sh --mac --x64", "build:rust:win": "bash ./build-binaries.sh --windows", + "build:swift:mac": "cd native/cursor-context && swift build -c release", "build:win": "bash ./build-app.sh windows", "build:mac": "bash ./build-app.sh mac", "generate:constants": "bun scripts/generate-constants.js", diff --git a/server/infra/lib/service-stack.ts b/server/infra/lib/service-stack.ts index f1160fc9..fd099c50 100644 --- a/server/infra/lib/service-stack.ts +++ b/server/infra/lib/service-stack.ts @@ -84,10 +84,10 @@ export class ServiceStack extends Stack { `${stageName}/ito/stripe-secret-key`, ) - const stripeWebhookSecretSecret = Secret.fromSecretCompleteArn( + const stripeWebhookSecret = Secret.fromSecretNameV2( this, 'StripeWebhookSecret', - `arn:aws:secretsmanager:${this.region}:${this.account}:secret:${stageName}/ito/stripe-webhook-secret-pALRS1`, + `${stageName}/ito/stripe-webhook`, ) // Setup domain and certificate @@ -119,7 +119,7 @@ export class ServiceStack extends Stack { groqApiKeySecret, cerebrasApiKeySecret, stripeSecretKeySecret, - stripeWebhookSecretSecret, + stripeWebhookSecret: stripeWebhookSecret, dbEndpoint: props.dbEndpoint, dbName: DB_NAME, dbPort: DB_PORT, diff --git a/server/infra/lib/service/fargate-task.ts b/server/infra/lib/service/fargate-task.ts index a4c8d412..93bab3e8 100644 --- a/server/infra/lib/service/fargate-task.ts +++ b/server/infra/lib/service/fargate-task.ts @@ -27,7 +27,7 @@ export interface FargateTaskConfig { groqApiKeySecret: ISecret cerebrasApiKeySecret: ISecret stripeSecretKeySecret: ISecret - stripeWebhookSecretSecret: ISecret + stripeWebhookSecret: ISecret dbEndpoint: string dbName: string dbPort: number @@ -60,7 +60,7 @@ export function createFargateTask( config.groqApiKeySecret.grantRead(fargateTaskRole) config.cerebrasApiKeySecret.grantRead(fargateTaskRole) config.stripeSecretKeySecret.grantRead(fargateTaskRole) - config.stripeWebhookSecretSecret.grantRead(fargateTaskRole) + config.stripeWebhookSecret.grantRead(fargateTaskRole) const taskExecutionRole = new IamRole(scope, 'ItoTaskExecRole', { assumedBy: new ServicePrincipal('ecs-tasks.amazonaws.com'), @@ -76,7 +76,7 @@ export function createFargateTask( config.groqApiKeySecret.grantRead(taskExecutionRole) config.cerebrasApiKeySecret.grantRead(taskExecutionRole) config.stripeSecretKeySecret.grantRead(taskExecutionRole) - config.stripeWebhookSecretSecret.grantRead(taskExecutionRole) + config.stripeWebhookSecret.grantRead(taskExecutionRole) // Explicitly add policy statement for secrets to ensure permissions are applied correctly // This is a workaround for cases where grantRead() might not work correctly with fromSecretNameV2() @@ -91,7 +91,7 @@ export function createFargateTask( config.groqApiKeySecret.secretArn, config.cerebrasApiKeySecret.secretArn, config.stripeSecretKeySecret.secretArn, - config.stripeWebhookSecretSecret.secretArn, + config.stripeWebhookSecret.secretArn, ], }), ) @@ -129,7 +129,7 @@ export function createFargateTask( config.stripeSecretKeySecret, ), STRIPE_WEBHOOK_SECRET: EcsSecret.fromSecretsManager( - config.stripeWebhookSecretSecret, + config.stripeWebhookSecret, ), }, environment: {