diff --git a/.vscode/extensions.json b/.vscode/extensions.json index bda493d8868c..b0d7a2ff1e6d 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -2,7 +2,7 @@ // See https://go.microsoft.com/fwlink/?LinkId=827846 // for the documentation about the extensions.json format "recommendations": [ - "eg2.tslint", + "ms-vscode.vscode-typescript-tslint-plugin", "editorconfig.editorconfig" ] } diff --git a/.vscode/launch.json b/.vscode/launch.json index 572a53e118d9..7454065992c4 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -131,8 +131,8 @@ "--ui=tdd", "--recursive", "--colors", - "--timeout=300000", - "--grep=" + //"--grep", "", + "--timeout=300000" ], "outFiles": [ "${workspaceFolder}/out/**/*.js" @@ -152,8 +152,8 @@ "--ui=tdd", "--recursive", "--colors", - "--timeout=300000", - "--grep=" + //"--grep", "", + "--timeout=300000" ], "outFiles": [ "${workspaceFolder}/out/**/*.js" diff --git a/news/3 Code Health/4852.md b/news/3 Code Health/4852.md index 14c4e854548a..b29c505021e8 100644 --- a/news/3 Code Health/4852.md +++ b/news/3 Code Health/4852.md @@ -1 +1 @@ -Hash imports so we can get every import and not just known ones. Should happen at most once every 5 seconds. \ No newline at end of file +Hash imports of top-level packages to see what users need supported. diff --git a/src/client/telemetry/constants.ts b/src/client/telemetry/constants.ts index 89bc0b9882fe..b734d71a1e8c 100644 --- a/src/client/telemetry/constants.ts +++ b/src/client/telemetry/constants.ts @@ -72,8 +72,7 @@ export enum EventName { SELECT_LINTER = 'LINTING.SELECT', LINTER_NOT_INSTALLED_PROMPT = 'LINTER_NOT_INSTALLED_PROMPT', - KNOWN_IMPORT_FROM_FILE = 'KNOWN_IMPORT_FROM_FILE', - KNOWN_IMPORT_FROM_EXECUTION = 'KNOWN_IMPORT_FROM_EXECUTION' + HASHED_PACKAGE_NAME = 'HASHED_PACKAGE_NAME' } export enum PlatformErrors { diff --git a/src/client/telemetry/importTracker.ts b/src/client/telemetry/importTracker.ts index 1d3f7ec787c1..9cf4af027593 100644 --- a/src/client/telemetry/importTracker.ts +++ b/src/client/telemetry/importTracker.ts @@ -11,12 +11,27 @@ import { sendTelemetryEvent } from '.'; import { noop } from '../../test/core'; import { IDocumentManager } from '../common/application/types'; import { isTestExecution } from '../common/constants'; -import { IHistoryProvider } from '../datascience/types'; -import { ICodeExecutionManager } from '../terminals/types'; import { EventName } from './constants'; import { IImportTracker } from './types'; -const ImportRegEx = /^(?!['"#]).*from\s+([a-zA-Z0-9_\.]+)\s+import.*(?!['"])|^(?!['"#]).*import\s+([a-zA-Z0-9_\., ]+).*(?!['"])/; +/* +Python has a fairly rich import statement, but luckily we only care about top-level (public) packages. +That means we can ignore: + +- Relative imports +- `as` rebindings +- The`fromlist` + +We can also ignore multi-line/parenthesized imports for simplicity since we don't' need 100% accuracy, +just enough to be able to tell what packages user's rely on to make sure we are covering our bases +in terms of support. + +We can rely on the fact that the use of the `from` and `import` keywords from the start of a line are +only usable for imports (`from` can also be used when raising an exception, but `raise` would be the +first keyword on a line in that instance). We also get to rely on the fact that we only care about +the top-level package, keeping the regex extremely greedy and simple for performance. +*/ +const ImportRegEx = /^\s*(from\s+(?\w+)|import\s+(?(\w+(?:\s*,\s*)?)+))/; const MAX_DOCUMENT_LINES = 1000; // Capture isTestExecution on module load so that a test can turn it off and still @@ -32,25 +47,14 @@ export class ImportTracker implements IImportTracker { private hashFn = require('hash.js').sha256; constructor( - @inject(IDocumentManager) private documentManager: IDocumentManager, - @inject(IHistoryProvider) private historyProvider: IHistoryProvider, - @inject(ICodeExecutionManager) private executionManager: ICodeExecutionManager + @inject(IDocumentManager) private documentManager: IDocumentManager ) { - // Sign up for document open/save events so we can track known imports this.documentManager.onDidOpenTextDocument((t) => this.onOpenedOrSavedDocument(t)); this.documentManager.onDidSaveTextDocument((t) => this.onOpenedOrSavedDocument(t)); - - // Sign up for history execution events (user can input code here too) - this.historyProvider.onExecutedCode(c => this.onExecutedCode(c)); - - // Sign up for terminal execution events (user can send code to the terminal) - // However we won't get any text typed directly into the terminal. Not part of the VS code API - // Could potentially hook stdin? Not sure that's possible. - this.executionManager.onExecutedCode(c => this.onExecutedCode(c)); } public async activate(): Promise { - // Act like all of our open documents just opened. Our timeout will make sure this is delayed + // Act like all of our open documents just opened; our timeout will make sure this is delayed. this.documentManager.textDocuments.forEach(d => this.onOpenedOrSavedDocument(d)); } @@ -66,9 +70,8 @@ export class ImportTracker implements IImportTracker { } private onOpenedOrSavedDocument(document: TextDocument) { - // Make sure this is a python file. + // Make sure this is a Python file. if (path.extname(document.fileName) === '.py') { - // Parse the contents of the document, looking for import matches on each line this.scheduleDocument(document); } } @@ -94,48 +97,39 @@ export class ImportTracker implements IImportTracker { private checkDocument(document: TextDocument) { this.pendingDocs.delete(document.fileName); const lines = this.getDocumentLines(document); - this.lookForImports(lines, EventName.KNOWN_IMPORT_FROM_FILE); + this.lookForImports(lines); } - private onExecutedCode(code: string) { - const lines = code.splitLines({ trim: true, removeEmptyEntries: true }); - this.lookForImports(lines, EventName.KNOWN_IMPORT_FROM_EXECUTION); + private sendTelemetry(packageName: string) { + // No need to send duplicate telemetry or waste CPU cycles on an unneeded hash. + if (this.sentMatches.has(packageName)) { + return; + } + this.sentMatches.add(packageName); + // Hash the package name so that we will never accidentally see a + // user's private package name. + const hash = this.hashFn().update(packageName).digest('hex'); + sendTelemetryEvent(EventName.HASHED_PACKAGE_NAME, undefined, {hashedName: hash}); } - private lookForImports(lines: (string | undefined)[], eventName: string) { + private lookForImports(lines: (string | undefined)[]) { try { - // Use a regex to parse each line, looking for imports - const matches: Set = new Set(); for (const s of lines) { const match = s ? ImportRegEx.exec(s) : null; - if (match && match.length > 2) { - // Could be a from or a straight import. from is the first entry. - const actual = match[1] ? match[1] : match[2]; - - // Use just the bits to the left of ' as ' - const left = actual.split(' as ')[0]; - - // Now split this based on, and chop off all . - const baseNames = left.split(',').map(l => l.split('.')[0].trim()); - baseNames.forEach(l => { - // Hash this value and save this in our import - const hash = this.hashFn().update(l).digest('hex'); - if (!this.sentMatches.has(hash)) { - matches.add(hash); - } - }); + if (match !== null && match.groups !== undefined) { + if (match.groups.fromImport !== undefined) { + // `from pkg ...` + this.sendTelemetry(match.groups.fromImport); + } else if (match.groups.importImport !== undefined) { + // `import pkg1, pkg2, ...` + const packageNames = match.groups.importImport.split(',').map(rawPackageName => rawPackageName.trim()); + // Can't pass in `this.sendTelemetry` directly as that rebinds `this`. + packageNames.forEach(p => this.sendTelemetry(p)); + } } } - - // For each unique match, emit a new telemetry event. - matches.forEach(s => { - sendTelemetryEvent( - eventName === EventName.KNOWN_IMPORT_FROM_FILE ? EventName.KNOWN_IMPORT_FROM_FILE : EventName.KNOWN_IMPORT_FROM_EXECUTION, - 0, - { import: s }); - this.sentMatches.add(s); - }); } catch { + // Don't care about failures since this is just telemetry. noop(); } } diff --git a/src/client/telemetry/index.ts b/src/client/telemetry/index.ts index f773e012178c..359fb3d15fe6 100644 --- a/src/client/telemetry/index.ts +++ b/src/client/telemetry/index.ts @@ -274,8 +274,7 @@ export interface IEventNamePropertyMapping { [EventName.FORMAT_SORT_IMPORTS]: never | undefined; [EventName.GO_TO_OBJECT_DEFINITION]: never | undefined; [EventName.HOVER_DEFINITION]: never | undefined; - [EventName.KNOWN_IMPORT_FROM_FILE]: { import: string }; - [EventName.KNOWN_IMPORT_FROM_EXECUTION]: { import: string }; + [EventName.HASHED_PACKAGE_NAME]: { hashedName: string }; [EventName.LINTER_NOT_INSTALLED_PROMPT]: LinterInstallPromptTelemetry; [EventName.PYTHON_INSTALL_PACKAGE]: { installer: string }; [EventName.LINTING]: LintingTelemetry; diff --git a/src/test/telemetry/importTracker.unit.test.ts b/src/test/telemetry/importTracker.unit.test.ts index 2c486c1c17a6..5fdb5c64b4ec 100644 --- a/src/test/telemetry/importTracker.unit.test.ts +++ b/src/test/telemetry/importTracker.unit.test.ts @@ -8,10 +8,8 @@ import * as TypeMoq from 'typemoq'; import { EventEmitter, TextDocument } from 'vscode'; import { IDocumentManager } from '../../client/common/application/types'; -import { IHistoryProvider } from '../../client/datascience/types'; import { EventName } from '../../client/telemetry/constants'; import { ImportTracker } from '../../client/telemetry/importTracker'; -import { ICodeExecutionManager } from '../../client/terminals/types'; import { createDocument } from '../datascience/editor-integration/helpers'; suite('Import Tracker', () => { @@ -21,12 +19,8 @@ suite('Import Tracker', () => { const hashJs = require('hash.js'); let importTracker: ImportTracker; let documentManager: TypeMoq.IMock; - let historyProvider: TypeMoq.IMock; - let codeExecutionManager: TypeMoq.IMock; let openedEventEmitter: EventEmitter; let savedEventEmitter: EventEmitter; - let historyEventEmitter: EventEmitter; - let codeExecutionEmitter: EventEmitter; const pandasHash = hashJs.sha256().update('pandas').digest('hex'); const elephasHash = hashJs.sha256().update('elephas').digest('hex'); const kerasHash = hashJs.sha256().update('keras').digest('hex'); @@ -54,21 +48,15 @@ suite('Import Tracker', () => { openedEventEmitter = new EventEmitter(); savedEventEmitter = new EventEmitter(); - historyEventEmitter = new EventEmitter(); - codeExecutionEmitter = new EventEmitter(); documentManager = TypeMoq.Mock.ofType(); - historyProvider = TypeMoq.Mock.ofType(); - codeExecutionManager = TypeMoq.Mock.ofType(); documentManager.setup(a => a.onDidOpenTextDocument).returns(() => openedEventEmitter.event); documentManager.setup(a => a.onDidSaveTextDocument).returns(() => savedEventEmitter.event); - historyProvider.setup(h => h.onExecutedCode).returns(() => historyEventEmitter.event); - codeExecutionManager.setup(c => c.onExecutedCode).returns(() => codeExecutionEmitter.event); rewiremock.enable(); rewiremock('vscode-extension-telemetry').with({ default: Reporter }); - importTracker = new ImportTracker(documentManager.object, historyProvider.object, codeExecutionManager.object); + importTracker = new ImportTracker(documentManager.object); }); teardown(() => { process.env.VSC_PYTHON_UNIT_TEST = oldValueOfVSC_PYTHON_UNIT_TEST; @@ -88,8 +76,8 @@ suite('Import Tracker', () => { test('Open document', () => { emitDocEvent('import pandas\r\n', openedEventEmitter); - expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_FILE]); - expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]); + expect(Reporter.eventNames).to.deep.equal([EventName.HASHED_PACKAGE_NAME]); + expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]); }); test('Already opened documents', async () => { @@ -97,80 +85,67 @@ suite('Import Tracker', () => { documentManager.setup(d => d.textDocuments).returns(() => [doc.object]); await importTracker.activate(); - expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_FILE]); - expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]); + expect(Reporter.eventNames).to.deep.equal([EventName.HASHED_PACKAGE_NAME]); + expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]); }); test('Save document', () => { emitDocEvent('import pandas\r\n', savedEventEmitter); - expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_FILE]); - expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]); + expect(Reporter.eventNames).to.deep.equal([EventName.HASHED_PACKAGE_NAME]); + expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]); }); - test('Execute', () => { - historyEventEmitter.fire('import pandas\r\n'); - - expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_EXECUTION]); - expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]); - - codeExecutionEmitter.fire('import pandas\r\n'); - - // Should not emit another event. - expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_EXECUTION]); - expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]); - }); - - test('elephas', () => { + test('from ._ import _, _', () => { const elephas = ` from elephas.java import java_classes, adapter from keras.models import Sequential from keras.layers import Dense - - + + model = Sequential() model.add(Dense(units=64, activation='relu', input_dim=100)) model.add(Dense(units=10, activation='softmax')) model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy']) - + model.save('test.h5') - - + + kmi = java_classes.KerasModelImport file = java_classes.File("test.h5") - + java_model = kmi.importKerasSequentialModelAndWeights(file.absolutePath) - + weights = adapter.retrieve_keras_weights(java_model) model.set_weights(weights)`; - historyEventEmitter.fire(elephas); - expect(Reporter.properties).to.deep.equal([{ import: elephasHash }, { import: kerasHash }]); + emitDocEvent(elephas, savedEventEmitter); + expect(Reporter.properties).to.deep.equal([{hashedName: elephasHash}, {hashedName: kerasHash}]); }); - test('pyspark', () => { + test('from ._ import _', () => { const pyspark = `from pyspark.ml.classification import LogisticRegression from pyspark.ml.evaluation import MulticlassClassificationEvaluator from pyspark.ml import Pipeline from sparkdl import DeepImageFeaturizer - + featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label") p = Pipeline(stages=[featurizer, lr]) - + model = p.fit(train_images_df) # train_images_df is a dataset of images and labels - + # Inspect training error df = model.transform(train_images_df.limit(10)).select("image", "probability", "uri", "label") predictionAndLabels = df.select("prediction", "label") evaluator = MulticlassClassificationEvaluator(metricName="accuracy") print("Training set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))`; - historyEventEmitter.fire(pyspark); - expect(Reporter.properties).to.deep.equal([{ import: pysparkHash }, { import: sparkdlHash }]); + emitDocEvent(pyspark, savedEventEmitter); + expect(Reporter.properties).to.deep.equal([{hashedName: pysparkHash}, {hashedName: sparkdlHash}]); }); - test('numpy', () => { + test('import as _', () => { const code = `import pandas as pd import numpy as np import random as rnd @@ -182,11 +157,11 @@ def simplify_ages(df): categories = pd.cut(df.Age, bins, labels=group_names) df.Age = categories return df`; - historyEventEmitter.fire(code); - expect(Reporter.properties).to.deep.equal([{ import: pandasHash }, { import: numpyHash }, { import: randomHash }]); + emitDocEvent(code, savedEventEmitter); + expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}, {hashedName: numpyHash}, {hashedName: randomHash}]); }); - test('scipy', () => { + test('from import _', () => { const code = `from scipy import special def drumhead_height(n, k, distance, angle, t): kth_zero = special.jn_zeros(n, k)[-1] @@ -196,11 +171,11 @@ radius = np.r_[0:1:50j] x = np.array([r * np.cos(theta) for r in radius]) y = np.array([r * np.sin(theta) for r in radius]) z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`; - historyEventEmitter.fire(code); - expect(Reporter.properties).to.deep.equal([{ import: scipyHash }]); + emitDocEvent(code, savedEventEmitter); + expect(Reporter.properties).to.deep.equal([{hashedName: scipyHash}]); }); - test('function', () => { + test('Import from within a function', () => { const code = ` def drumhead_height(n, k, distance, angle, t): import sklearn as sk @@ -210,11 +185,11 @@ radius = np.r_[0:1:50j] x = np.array([r * np.cos(theta) for r in radius]) y = np.array([r * np.sin(theta) for r in radius]) z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`; - historyEventEmitter.fire(code); - expect(Reporter.properties).to.deep.equal([{ import: sklearnHash }]); + emitDocEvent(code, savedEventEmitter); + expect(Reporter.properties).to.deep.equal([{hashedName: sklearnHash}]); }); - test('Comma separated', () => { + test('import , ', () => { const code = ` def drumhead_height(n, k, distance, angle, t): import sklearn, pandas @@ -224,9 +199,21 @@ radius = np.r_[0:1:50j] x = np.array([r * np.cos(theta) for r in radius]) y = np.array([r * np.sin(theta) for r in radius]) z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`; - historyEventEmitter.fire(code); - expect(Reporter.properties).to.deep.equal([{ import: sklearnHash }, { import: pandasHash }]); + emitDocEvent(code, savedEventEmitter); + expect(Reporter.properties).to.deep.equal([{hashedName: sklearnHash}, {hashedName: pandasHash}]); }); - // That's probably enough different variants of code to verify nothing is wonky. + test('Do not send the same package twice', () => { + const code = ` +import pandas +import pandas`; + emitDocEvent(code, savedEventEmitter); + expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]); + }); + + test('Ignore relative imports', () => { + const code = 'from .pandas import not_real'; + emitDocEvent(code, savedEventEmitter); + expect(Reporter.properties).to.deep.equal([]); + }); });