From 4bef4c26618a1f50f2aa2c2527f63ac42f384a81 Mon Sep 17 00:00:00 2001
From: Thijs Louisse
Date: Wed, 8 May 2024 14:10:04 +0200
Subject: [PATCH] feat: optimised-glob util

---
 .../src/program/utils/optimised-glob.js       | 302 ++++++++++++++++++
 .../program/utils/optimised-glob.test.js      | 274 +++++++++++++++++
 2 files changed, 576 insertions(+)
 create mode 100644 packages-node/providence-analytics/src/program/utils/optimised-glob.js
 create mode 100644 packages-node/providence-analytics/test-node/program/utils/optimised-glob.test.js

diff --git a/packages-node/providence-analytics/src/program/utils/optimised-glob.js b/packages-node/providence-analytics/src/program/utils/optimised-glob.js
new file mode 100644
index 000000000..3db13b3d7
--- /dev/null
+++ b/packages-node/providence-analytics/src/program/utils/optimised-glob.js
@@ -0,0 +1,302 @@
+/* eslint-disable no-case-declarations */
+/* eslint-disable no-fallthrough */
+import nodeFs from 'fs';
+import path from 'path';
+
+import { toPosixPath } from './to-posix-path.js';
+
+const [nodeMajor] = process.versions.node.split('.').map(Number);
+
+if (nodeMajor < 18) {
+  throw new Error('[optimisedGlob] Node.js version 18 or higher is required');
+}
+
+/**
+ * @typedef {import('memfs').Volume|nodeFs} FsLike
+ * @typedef {{onlyDirectories:boolean;onlyFiles:boolean;deep:number;suppressErrors:boolean;fs: FsLike;cwd:string;absolute:boolean;extglob:boolean;globstar:boolean;dot:boolean;unique:boolean;}} FastGlobtions
+ */
+
+/** Characters that are always matched literally and therefore need escaping in the regex */
+const charsToEscape = ['/', '$', '^', '+', '.', '(', ')', '=', '!', '|'];
+
+/** Characters that only have a special meaning in globs when `extglob` is enabled */
+const extglobChars = ['?', '[', ']', '{', '}'];
+
+/**
+ * Converts a glob into a regular expression that is anchored at both ends (`^...$`).
+ * @param {string} glob
+ * @param {{globstar?:boolean;extglob?:boolean}} [providedOpts]
+ * @param {boolean} [providedOpts.globstar=true] if true, '/foo/*' => '^\/foo\/[^/]*$' (not allowing folders inside *), else '/foo/*' => '^\/foo\/.*$'
+ * @param {boolean} [providedOpts.extglob=true] if true, supports so called "extended" globs (like bash) and single character matching, matching ranges of characters, group matching etc. When false, `?`, `[`, `]`, `{` and `}` are matched literally.
+ * @returns {RegExp}
+ * @throws {TypeError} when the provided glob is not a string
+ */
+export function parseGlobToRegex(glob, providedOpts) {
+  if (typeof glob !== 'string') throw new TypeError('Expected a string');
+
+  const options = {
+    globstar: true,
+    extglob: true,
+    ...providedOpts,
+  };
+
+  let regexResultStr = '';
+  let isInGroup = false;
+  let currentChar;
+
+  for (let i = 0; i < glob.length; i += 1) {
+    currentChar = glob[i];
+
+    if (charsToEscape.includes(currentChar)) {
+      regexResultStr += `\\${currentChar}`;
+      continue; // eslint-disable-line no-continue
+    }
+
+    if (options.extglob) {
+      if (currentChar === '?') {
+        regexResultStr += '.';
+        continue; // eslint-disable-line no-continue
+      }
+      if (['[', ']'].includes(currentChar)) {
+        regexResultStr += currentChar;
+        continue; // eslint-disable-line no-continue
+      }
+      if (currentChar === '{') {
+        isInGroup = true;
+        regexResultStr += '(';
+        continue; // eslint-disable-line no-continue
+      }
+      if (currentChar === '}') {
+        isInGroup = false;
+        regexResultStr += ')';
+        continue; // eslint-disable-line no-continue
+      }
+    } else if (extglobChars.includes(currentChar)) {
+      // Without extglob these characters have no glob meaning. Escape them, so they do not end
+      // up as regex syntax (quantifier, character class) in the result
+      regexResultStr += `\\${currentChar}`;
+      continue; // eslint-disable-line no-continue
+    }
+
+    if (currentChar === ',') {
+      if (isInGroup) {
+        regexResultStr += '|';
+        continue; // eslint-disable-line no-continue
+      }
+      regexResultStr += `\\${currentChar}`;
+      continue; // eslint-disable-line no-continue
+    }
+
+    if (currentChar === '*') {
+      const prevChar = glob[i - 1];
+      let isMultiStar = false;
+      while (glob[i + 1] === '*') {
+        isMultiStar = true;
+        i += 1;
+      }
+      const nextChar = glob[i + 1];
+      if (!options.globstar) {
+        // Treat any number of "*" as one
+        regexResultStr += '.*';
+      } else {
+        const isGlobstarSegment =
+          isMultiStar && ['/', undefined].includes(prevChar) && ['/', undefined].includes(nextChar);
+        if (isGlobstarSegment) {
+          // Match zero or more path segments
+          regexResultStr += '((?:[^/]*(?:/|$))*)';
+          // Move over the "/"
+          i += 1;
+        } else {
+          // Only match one path segment
+          regexResultStr += '([^/]*)';
+        }
+      }
+      continue; // eslint-disable-line no-continue
+    }
+    regexResultStr += currentChar;
+  }
+
+  return new RegExp(`^${regexResultStr}$`);
+}
+
+/**
+ * Gets the leading path segments of a glob that contain none of the reserved characters.
+ * Note that '.' is reserved as well, so a segment like 'file.js' ends the start path.
+ * @param {string} glob
+ * @returns {string} segments joined with '/', or '' when the first segment is reserved already
+ */
+function getStartPath(glob) {
+  const reservedChars = ['?', '[', ']', '{', '}', ',', '.', '*'];
+  let hasFoundReservedChar = false;
+  return glob
+    .split('/')
+    .map(part => {
+      if (hasFoundReservedChar) return undefined;
+      hasFoundReservedChar = reservedChars.some(reservedChar => part.includes(reservedChar));
+      return hasFoundReservedChar ? undefined : part;
+    })
+    .filter(Boolean)
+    .join('/');
+}
+
+let isCacheEnabled = false;
+/** @type {{[path:string]:nodeFs.Dirent[]}} */
+const cache = {};
+
+/**
+ * Recursively reads all directory entries below the start path (via `fs.promises.readdir`).
+ * @param {string} fullStartPath
+ * @param {{fs?:FsLike}} providedOptions
+ * @returns {Promise<nodeFs.Dirent[]>}
+ */
+async function getAllFilesFromStartPath(
+  fullStartPath,
+  { fs = /** @type {* & FsLike} */ (nodeFs) } = {},
+) {
+  if (isCacheEnabled && cache[fullStartPath]) return cache[fullStartPath];
+
+  /** @type {* & nodeFs.Dirent[]} */
+  const files = await fs.promises.readdir(fullStartPath, { withFileTypes: true, recursive: true });
+  if (isCacheEnabled) {
+    // Without caching, retained results would never be read and only occupy memory
+    cache[fullStartPath] = files;
+  }
+  return files;
+}
+
+/**
+ * Lightweight glob implementation.
+ * It's a drop-in replacement for globby, but it's faster, a few hundred lines of code and has no dependencies.
+ * @param {string|string[]} globOrGlobs
+ * @param {Partial<FastGlobtions>} providedOptions
+ * @returns {Promise<string[]>}
+ */
+export async function optimisedGlob(globOrGlobs, providedOptions = {}) {
+  const options = {
+    fs: /** @type {* & FsLike} */ (nodeFs),
+    onlyDirectories: false,
+    suppressErrors: true,
+    cwd: process.cwd(),
+    absolute: false,
+    onlyFiles: true,
+    deep: Infinity,
+    globstar: true,
+    extglob: true,
+    unique: true,
+    sync: false,
+    dot: false,
+    // TODO: ignore, throwErrorOnBrokenSymbolicLink, markDirectories, objectMode, onlyDirectories, onlyFiles, stats
+    // https://github.com/mrmlnc/fast-glob?tab=readme-ov-file
+    ...providedOptions,
+  };
+
+  if (!options.onlyFiles) {
+    // This makes behavior aligned with globby
+    options.onlyDirectories = true;
+  }
+
+  const globs = Array.isArray(globOrGlobs) ? globOrGlobs : [globOrGlobs];
+
+  // Found paths start with this prefix; stripping it makes them relative to cwd.
+  // Trailing slashes of cwd are dropped first, so a cwd like '/my/project/' works as well
+  const cwdPrefix = `${options.cwd.replace(/\/+$/, '')}/`;
+
+  /** @type {RegExp[]} */
+  const matchRegexesNegative = [];
+  /** @type {RegExp[]} */
+  const matchRegexes = [];
+  /** @type {{dirent:nodeFs.Dirent;relativeToCwdPath:string}[]} */
+  const globEntries = [];
+
+  for (const glob of globs) {
+    const isNegative = glob.startsWith('!');
+
+    // Drop the negation marker first, so that relative paths like './my/folder/**/*.js' and
+    // '!./my/folder/**/*.js' are both changed to 'my/folder/**/*.js'
+    const globNormalized = (isNegative ? glob.slice(1) : glob).replace(/^\.\//, '');
+
+    const regexForGlob = parseGlobToRegex(globNormalized, {
+      globstar: options.globstar,
+      extglob: options.extglob,
+    });
+    if (isNegative) {
+      // Negated globs only exclude results of other globs: no need to search the fs for them
+      matchRegexesNegative.push(regexForGlob);
+      continue; // eslint-disable-line no-continue
+    }
+    matchRegexes.push(regexForGlob);
+
+    // Search for the "deepest" starting point in the filesystem that we can use to search the fs
+    const startPath = getStartPath(globNormalized);
+    const fullStartPath = path.join(options.cwd, startPath);
+
+    try {
+      const allDirentsRelativeToStartPath = await getAllFilesFromStartPath(fullStartPath, {
+        fs: options.fs,
+      });
+
+      for (const dirent of allDirentsRelativeToStartPath) {
+        // `dirent.path` is deprecated in favor of `dirent.parentPath`. The old property is only
+        // read as a fallback for Node.js versions that do not provide `parentPath` yet
+        const direntAny = /** @type {*} */ (dirent);
+        const parentPath = direntAny.parentPath ?? direntAny.path;
+        const posixPath = toPosixPath(path.join(parentPath, dirent.name));
+
+        // Pushing one by one (instead of spreading into push) stays safe for huge folders.
+        // The cwd prefix is only stripped from the start of the path
+        globEntries.push({
+          relativeToCwdPath: posixPath.startsWith(cwdPrefix)
+            ? posixPath.slice(cwdPrefix.length)
+            : posixPath,
+          dirent,
+        });
+      }
+    } catch (e) {
+      if (!options.suppressErrors) {
+        throw e;
+      }
+    }
+  }
+
+  // TODO: for perf, combine options checks instead of doing multiple filters and maps
+  const matchedEntries = globEntries.filter(
+    globEntry =>
+      matchRegexes.some(globRe => globRe.test(globEntry.relativeToCwdPath)) &&
+      !matchRegexesNegative.some(globReNeg => globReNeg.test(globEntry.relativeToCwdPath)),
+  );
+
+  const allFileOrDirectoryEntries = matchedEntries.filter(({ dirent }) =>
+    options.onlyDirectories ? dirent.isDirectory() : dirent.isFile(),
+  );
+
+  let filteredPaths = allFileOrDirectoryEntries.map(({ relativeToCwdPath }) => relativeToCwdPath);
+
+  if (!options.dot) {
+    filteredPaths = filteredPaths.filter(
+      f => !f.split('/').some(folderOrFile => folderOrFile.startsWith('.')),
+    );
+  }
+
+  if (options.absolute) {
+    filteredPaths = filteredPaths.map(f => path.posix.join(options.cwd, f));
+    if (process.platform === 'win32') {
+      const driveLetter = path.win32.resolve(options.cwd).slice(0, 1).toUpperCase();
+      filteredPaths = filteredPaths.map(f => `${driveLetter}:${f}`);
+    }
+  }
+
+  if (options.deep !== Infinity) {
+    filteredPaths = filteredPaths.filter(f => f.split('/').length <= options.deep + 2);
+  }
+
+  const result = options.unique ? Array.from(new Set(filteredPaths)) : filteredPaths;
+
+  return result.sort((a, b) => {
+    const pathDiff = a.split('/').length - b.split('/').length;
+    return pathDiff !== 0 ? pathDiff : a.localeCompare(b);
+  });
+}
+
+optimisedGlob.disableCache = () => {
+  isCacheEnabled = false;
+};
diff --git a/packages-node/providence-analytics/test-node/program/utils/optimised-glob.test.js b/packages-node/providence-analytics/test-node/program/utils/optimised-glob.test.js
new file mode 100644
index 000000000..56ec63d5d
--- /dev/null
+++ b/packages-node/providence-analytics/test-node/program/utils/optimised-glob.test.js
@@ -0,0 +1,274 @@
+import { globby } from 'globby';
+// eslint-disable-next-line import/no-extraneous-dependencies
+import { expect } from 'chai';
+// import { vol } from 'memfs';
+// eslint-disable-next-line import/no-extraneous-dependencies
+import mockFs from 'mock-fs';
+
+import { optimisedGlob } from '../../../src/program/utils/optimised-glob.js';
+
+const measurePerf = process.argv.includes('--measure-perf');
+
+/**
+ * @param {*} patterns
+ * @param {*} options
+ * @returns
+ */
+async function runOptimisedGlobAndCheckGlobbyParity(patterns, options) {
+  performance.mark('start-optimisedGlob');
+  const optimisedGlobResult = await optimisedGlob(patterns, options);
+  performance.mark('end-optimisedGlob');
+
+  performance.mark('start-globby');
+  const globbyResult = await globby(patterns, options);
+  performance.mark('end-globby');
+
+  if (measurePerf) {
+    const optimisedGlobPerf = performance.measure(
+      'optimisedGlob',
+      'start-optimisedGlob',
+      'end-optimisedGlob',
+    );
+    const globbyPerf = performance.measure('globby', 'start-globby', 'end-globby');
+    console.debug(
+      `optimisedGlob was ${
+        globbyPerf.duration - optimisedGlobPerf.duration
+      }ms quicker than globby.`,
+    );
+  }
+
+  expect(optimisedGlobResult).to.deep.equal(globbyResult);
+
+  return optimisedGlobResult;
+}
+
+describe('optimisedGlob', () => {
+  const testCfg = {
+    cwd: '/fakeFs',
+    // fs: vol,
+  };
+
+  beforeEach(() => {
+    const fakeFs = {
+      '/fakeFs/my/folder/some/file.js': 'content',
+      '/fakeFs/my/folder/lvl1/some/file.js': 'content',
+      '/fakeFs/my/folder/lvl1/lvl2/some/file.js': 'content',
+      '/fakeFs/my/folder/lvl1/lvl2/lvl3/some/file.js': 'content',
+      '/fakeFs/my/folder/some/file.d.ts': 'content',
+      '/fakeFs/my/folder/lvl1/some/file.d.ts': 'content',
+      '/fakeFs/my/folder/lvl1/lvl2/some/file.d.ts': 'content',
+      '/fakeFs/my/folder/lvl1/lvl2/lvl3/some/file.d.ts': 'content',
+
+      '/fakeFs/my/folder/some/anotherFile.js': 'content',
+      '/fakeFs/my/folder/lvl1/some/anotherFile.js': 'content',
+      '/fakeFs/my/folder/lvl1/lvl2/some/anotherFile.js': 'content',
+      '/fakeFs/my/folder/lvl1/lvl2/lvl3/some/anotherFile.js': 'content',
+      '/fakeFs/my/folder/some/anotherFile.d.ts': 'content',
+      '/fakeFs/my/folder/lvl1/some/anotherFile.d.ts': 'content',
+      '/fakeFs/my/folder/lvl1/lvl2/some/anotherFile.d.ts': 'content',
+      '/fakeFs/my/folder/lvl1/lvl2/lvl3/some/anotherFile.d.ts': 'content',
+
+      '/fakeFs/my/.hiddenFile.js': 'content',
+    };
+
+    // vol.fromJSON(fakeFs);
+    mockFs(fakeFs);
+  });
+
+  afterEach(() => {
+    // vol.reset();
+    mockFs.restore();
+  });
+
+  describe('Star patterns', () => {
+    it('supports double asterisk like "my/folder/**/some/file.js" ', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity(
+        'my/folder/**/some/file.js',
+        testCfg,
+      );
+
+      expect(files).to.deep.equal([
+        'my/folder/some/file.js',
+        'my/folder/lvl1/some/file.js',
+        'my/folder/lvl1/lvl2/some/file.js',
+        'my/folder/lvl1/lvl2/lvl3/some/file.js',
+      ]);
+    });
+
+    it('supports single asterisk like "my/folder/*/some/file.js" ', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('my/folder/*/some/file.js', testCfg);
+
+      expect(files).to.deep.equal(['my/folder/lvl1/some/file.js']);
+    });
+
+    it('supports filenames like "my/folder/lvl1/some/*il*.js" ', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity(
+        'my/folder/lvl1/some/*il*.js',
+        testCfg,
+      );
+
+      expect(files).to.deep.equal([
+        'my/folder/lvl1/some/anotherFile.js',
+        'my/folder/lvl1/some/file.js',
+      ]);
+    });
+
+    it('supports globs starting with a star like "**/some/file.js" ', async () => {
+      const filesDoubleStar = await runOptimisedGlobAndCheckGlobbyParity(
+        '**/some/file.js',
+        testCfg,
+      );
+
+      expect(filesDoubleStar).to.deep.equal([
+        'my/folder/some/file.js',
+        'my/folder/lvl1/some/file.js',
+        'my/folder/lvl1/lvl2/some/file.js',
+        'my/folder/lvl1/lvl2/lvl3/some/file.js',
+      ]);
+
+      const filesSingleStar = await runOptimisedGlobAndCheckGlobbyParity(
+        '*/folder/some/file.js',
+        testCfg,
+      );
+
+      expect(filesSingleStar).to.deep.equal(['my/folder/some/file.js']);
+    });
+
+    it('gives empty output when location does not exist" ', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('my/folder/**/some/file.js', {
+        ...testCfg,
+        cwd: '/nonExisting/path', // this will not exist
+      });
+
+      expect(files).to.deep.equal([]);
+    });
+
+    it('omits hidden files" ', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('*/*/*/*', testCfg);
+
+      expect(files).to.deep.equal([
+        'my/folder/some/anotherFile.d.ts',
+        'my/folder/some/anotherFile.js',
+        'my/folder/some/file.d.ts',
+        'my/folder/some/file.js',
+      ]);
+    });
+  });
+
+  describe('Accolade patterns', () => {
+    it('works with filenames like "my/folder/*/some/file.{js,d.ts}" ', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity(
+        'my/folder/*/some/file.{js,d.ts}',
+        testCfg,
+      );
+
+      expect(files).to.deep.equal(['my/folder/lvl1/some/file.d.ts', 'my/folder/lvl1/some/file.js']);
+    });
+  });
+
+  describe('Multiple globs', () => {
+    it('accepts an array of globs, like ["my/folder/*/some/file.js", "my/folder/lvl1/*/some/file.js"]', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity(
+        ['my/folder/*/some/file.js', 'my/folder/lvl1/*/some/file.js'],
+        testCfg,
+      );
+
+      expect(files).to.deep.equal([
+        'my/folder/lvl1/some/file.js',
+        'my/folder/lvl1/lvl2/some/file.js',
+      ]);
+    });
+
+    it('accepts nedgated globs, like ["my/folder/**/some/file.js", "!my/folder/*/some/file.js"]', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity(
+        ['my/folder/**/some/file.js', '!my/folder/*/some/file.js'],
+        testCfg,
+      );
+
+      expect(files).to.deep.equal([
+        'my/folder/some/file.js',
+        'my/folder/lvl1/lvl2/some/file.js',
+        'my/folder/lvl1/lvl2/lvl3/some/file.js',
+      ]);
+    });
+  });
+
+  describe('Options', () => {
+    it('"absolute" returns full system paths', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('my/folder/*/some/file.{js,d.ts}', {
+        ...testCfg,
+        absolute: true,
+      });
+
+      expect(files).to.deep.equal([
+        '/fakeFs/my/folder/lvl1/some/file.d.ts',
+        '/fakeFs/my/folder/lvl1/some/file.js',
+      ]);
+    });
+
+    it('"cwd" changes relative starting point of glob', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('folder/*/some/file.{js,d.ts}', {
+        ...testCfg,
+        cwd: '/fakeFs/my',
+      });
+
+      expect(files).to.deep.equal(['folder/lvl1/some/file.d.ts', 'folder/lvl1/some/file.js']);
+    });
+
+    it('"onlyDirectories" returns only directories/folders', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('my/folder/*/some', {
+        ...testCfg,
+        onlyDirectories: true,
+      });
+
+      expect(files).to.deep.equal(['my/folder/lvl1/some']);
+    });
+
+    it('"onlyFiles" returns only files', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('my/folder/*/some', {
+        ...testCfg,
+        onlyFiles: true,
+      });
+
+      expect(files).to.deep.equal([]);
+    });
+
+    it('"deep" limits the level of results', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('my/folder/**', {
+        ...testCfg,
+        onlyDirectories: true,
+        deep: 1,
+      });
+      expect(files).to.deep.equal(['my/folder/lvl1', 'my/folder/some']);
+
+      const files2 = await runOptimisedGlobAndCheckGlobbyParity('my/folder/**', {
+        ...testCfg,
+        onlyDirectories: true,
+        deep: 2,
+      });
+
+      expect(files2).to.deep.equal([
+        'my/folder/lvl1',
+        'my/folder/some',
+        'my/folder/lvl1/lvl2',
+        'my/folder/lvl1/some',
+      ]);
+    });
+
+    it('"dot" allows hidden files" ', async () => {
+      const files = await runOptimisedGlobAndCheckGlobbyParity('*/*', { ...testCfg, dot: true });
+
+      expect(files).to.deep.equal(['my/.hiddenFile.js']);
+    });
+
+    it.skip('"suppressErrors" throws errors when paths do not exist', async () => {
+      expect(async () =>
+        optimisedGlob('my/folder/**/some/file.js', {
+          ...testCfg,
+          cwd: '/nonExisting/path', // this will not exist
+          suppressErrors: false,
+        }),
+      ).to.throw();
+    });
+  });
+});