From 170472638f799fbb24c41adbc0128589a46b51d8 Mon Sep 17 00:00:00 2001 From: namatanda Date: Wed, 31 Dec 2025 22:03:27 +0300 Subject: [PATCH 1/4] feat: Migrate dictionary API from Google to Wiktionary, enhance HTML content cleaning, and adjust word normalization. --- README.md | 89 +++++++-------- app.js | 17 +-- modules/dictionary.js | 248 +++++++++++------------------------------- 3 files changed, 113 insertions(+), 241 deletions(-) diff --git a/README.md b/README.md index 04e2dad..2010fe7 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,29 @@ There was no free Dictionary API on the web when I wanted one for my friend, so I created one. +## Data Source & License + +This API uses [Wiktionary](https://en.wiktionary.org/) as its data source via the Wiktionary REST API. Wiktionary is a free, collaboratively edited multilingual dictionary. + +### Attribution + +Dictionary definitions provided by this API are sourced from [Wiktionary](https://en.wiktionary.org/), a project of the [Wikimedia Foundation](https://wikimediafoundation.org/). + +The content from Wiktionary is available under the [Creative Commons Attribution-ShareAlike 4.0 International License (CC-BY-SA 4.0)](https://creativecommons.org/licenses/by-sa/4.0/). + +**If you use this API, you must:** +- Provide attribution to Wiktionary as the source of the definitions +- Include a link to the CC-BY-SA 4.0 license +- If you modify the content, you must distribute your contributions under the same license + +For individual word entries, the original contributors can be found in the page history at `https://en.wiktionary.org/wiki/{word}`. + ## Important Note The API usage has been ramping up rapidly, making it difficult for me to keep the server running due to increased AWS costs. Your support directly helps the development of Dictionary API and keeps the server running. 
- + ## Getting Started @@ -25,26 +42,18 @@ As an example, to get definition of English word **hello** using _v2_, you can s [ { "word": "hello", - "phonetic": "həˈləʊ", - "phonetics": [ - { - "text": "həˈləʊ", - "audio": "//ssl.gstatic.com/dictionary/static/sounds/20200429/hello--_gb_1.mp3" - }, - { - "text": "hɛˈləʊ" - } - ], - "origin": "early 19th century: variant of earlier hollo ; related to holla.", + "phonetics": [], "meanings": [ { - "partOfSpeech": "exclamation", + "partOfSpeech": "interjection", "definitions": [ { - "definition": "used as a greeting or to begin a phone conversation.", - "example": "hello there, Katie!", - "synonyms": [], - "antonyms": [] + "definition": "A greeting (salutation) said when meeting someone or acknowledging someone's arrival or presence.", + "example": "Hello, everyone." + }, + { + "definition": "A greeting used when answering the telephone.", + "example": "Hello? How may I help you?" } ] }, @@ -52,10 +61,8 @@ As an example, to get definition of English word **hello** using _v2_, you can s "partOfSpeech": "noun", "definitions": [ { - "definition": "an utterance of ‘hello’; a greeting.", - "example": "she was getting polite nods and hellos from people", - "synonyms": [], - "antonyms": [] + "definition": "\"Hello!\" or an equivalent greeting.", + "example": "They gave each other a quick hello when they met, and went back on their merry ways." } ] }, @@ -63,10 +70,7 @@ As an example, to get definition of English word **hello** using _v2_, you can s "partOfSpeech": "verb", "definitions": [ { - "definition": "say or shout ‘hello’.", - "example": "I pressed the phone button and helloed", - "synonyms": [], - "antonyms": [] + "definition": "To greet with \"hello\"." } ] } @@ -75,6 +79,8 @@ As an example, to get definition of English word **hello** using _v2_, you can s ] ``` +> **Note:** The API now uses Wiktionary as its data source. 
Response format remains compatible but some fields like `phonetic`, `origin`, `synonyms`, and `antonyms` may not always be present. + ### Regarding V1 Version The API earlier used to send response as shown below, but this structure of response was found out to be difficult to work with (you can take a look at these tickets [#32](https://github.com/meetDeveloper/freeDictionaryAPI/issues/32) and [#4](https://github.com/meetDeveloper/freeDictionaryAPI/issues/4)), based on feedback in these tickets I have updated the API to _v2_ version. But _v1_ version will always be supported for backward compatibility. @@ -82,40 +88,23 @@ The API earlier used to send response as shown below, but this structure of resp [ { "word": "hello", - "phonetic": "həˈləʊ", - "phonetics": [ - { - "text": "həˈləʊ", - "audio": "//ssl.gstatic.com/dictionary/static/sounds/20200429/hello--_gb_1.mp3" - }, - { - "text": "hɛˈləʊ" - } - ], - "origin": "early 19th century: variant of earlier hollo ; related to holla.", + "phonetics": [], "meaning": { - "exclamation": [ + "interjection": [ { - "definition": "used as a greeting or to begin a phone conversation.", - "example": "hello there, Katie!", - "synonyms": [], - "antonyms": [] + "definition": "A greeting (salutation) said when meeting someone.", + "example": "Hello, everyone." } ], "noun": [ { - "definition": "an utterance of ‘hello’; a greeting.", - "example": "she was getting polite nods and hellos from people", - "synonyms": [], - "antonyms": [] + "definition": "\"Hello!\" or an equivalent greeting.", + "example": "They gave each other a quick hello." } ], "verb": [ { - "definition": "say or shout ‘hello’.", - "example": "I pressed the phone button and helloed", - "synonyms": [], - "antonyms": [] + "definition": "To greet with \"hello\"." } ] } @@ -147,7 +136,7 @@ This Dictionary API was initially created as an API that could be used by my fri Kindly help me keep running and developing this API. 
Thanks a lot for using my API, it feels good when your creation help other create their own projects. - + ## Related Projects diff --git a/app.js b/app.js index 0865cd0..347ea8b 100644 --- a/app.js +++ b/app.js @@ -34,22 +34,23 @@ const { JSDOM } = require('jsdom'), // GLOBALS global._ = require('lodash'); -function cleanText (text) { +function cleanText(text) { if (!text) { return text; } - return parser - .parseFromString(text, "text/html") - .body.textContent; + const doc = parser.parseFromString(text, "text/html"); + const elementsToRemove = doc.querySelectorAll("style, script"); + elementsToRemove.forEach(el => el.remove()); + return doc.body.textContent; } -function handleError (error = {}) { +function handleError(error = {}) { // Using duck typing to know if we explicitly threw this error // If not then wrapping original error into UnexpectedError if (!error.requestType) { error = new errors.UnexpectedError({ original_error: error }); } const { requestType, title, message, resolution } = error; - status = REQUEST_TYPE_STATUS_CODE[requestType], + status = REQUEST_TYPE_STATUS_CODE[requestType], body = JSON.stringify({ title, message, @@ -77,7 +78,7 @@ app.get('/api/:version/entries/:language/:word', async (req, res) => { word = decodeURIComponent(word); if (!word || !language || !version) { - return handleError.call(res, new errors.NoDefinitionsFound()); + return handleError.call(res, new errors.NoDefinitionsFound()); } // @todo: Find better error. @@ -93,7 +94,7 @@ app.get('/api/:version/entries/:language/:word', async (req, res) => { // @todo: Find better error. 
if (!utils.isLanguageSupported(language)) { return handleError.call(res, new errors.NoDefinitionsFound()); } - word = word.trim().toLocaleLowerCase(language); + word = word.trim(); try { let definitions = await dictionary.findDefinitions(word, language, { include }), diff --git a/modules/dictionary.js b/modules/dictionary.js index afc9ac6..94a6e4c 100644 --- a/modules/dictionary.js +++ b/modules/dictionary.js @@ -1,193 +1,75 @@ -const fs = require('fs'), - _ = require('lodash'), - https = require('https'), - fetch = require('node-fetch'), +const fetch = require('node-fetch'), + errors = require('./errors.js'); - utils = require('./utils.js'), - errors = require('./errors.js'), - - httpsAgent = new https.Agent({ keepAlive: true }); - -function transformV2toV1 (data) { +function transformV2toV1(data) { return data.map((entry) => { - let { - meanings, - ...otherProps - } = entry; - - meanings = meanings.reduce((meanings, meaning) => { - let partOfSpeech, definitions; - - ({ - partOfSpeech, - definitions - } = meaning); - meanings[partOfSpeech] = definitions; - - return meanings; - }, {}); - - return { - ...otherProps, - meaning: meanings - }; - }); -} - -function transform (word, language, data, { include }) { - return data - .map(e => e.entry) - .filter(e => e) - .reduce((accumulator, entry) => { - if (!entry.subentries) { return accumulator.push(entry) && accumulator; } - - let { subentries } = entry, - mappedSubentries; - - if (subentries.length > 1) { - utils.logEvent(word, language, 'subentries length is greater than 1', { data }); - } - - if (entry.sense_families) { - utils.logEvent(word, language, 'entry has subentries and sense families', { data }); - } - - if (entry.etymology) { - utils.logEvent(word, language, 'entry has subentries and etymology', { data }); - } - - mappedSubentries = subentries - .map((subentry) => { - if (subentry.sense_families) { - utils.logEvent(word, language, 'subentry has sense families', { data }); - } - - if (subentry.sense_family) 
{ - subentry.sense_families = []; - subentry.sense_families.push(subentry.sense_family); - } - - return _.defaults(subentry, _.pick(entry, ['phonetics', 'etymology'])) - }) - - return accumulator.concat(mappedSubentries); - }, []) - .map((entry) => { - let { headword, lemma, phonetics = [], etymology = {}, sense_families = [] } = entry; - - return { - word: lemma || headword, - phonetic: _.get(phonetics, '0.text'), - phonetics: phonetics.map((e) => { - return { - text: e.text, - audio: e.oxford_audio - }; - }), - origin: _.get(etymology, 'etymology.text'), - meanings: sense_families.map((sense_family) => { - let { parts_of_speech, senses = []} = sense_family; - - // if parts of speech is empty at this level. - // Current hypothesis tells that it means only one sense is present - // We need to take out parts_of_speech from it and use it. - if (!parts_of_speech) { - parts_of_speech = _.get(senses[0], 'parts_of_speech', []); - - if (senses.length > 1) { - utils.logEvent(word, language, 'part of speech missing but more than one sense present', { data }); - } - } - - if (parts_of_speech.length > 1) { - utils.logEvent(word, language, 'more than one part of speech present', { data }); - } - - return { - partOfSpeech: _.get(parts_of_speech[0], 'value'), - definitions: senses.map((sense) => { - let { definition = {}, example_groups = [], thesaurus_entries = [] } = sense, - result = { - definition: definition.text, - example: _.get(example_groups[0], 'examples.0'), - synonyms: _.get(thesaurus_entries[0], 'synonyms.0.nyms', []) - .map(e => e.nym), - antonyms: _.get(thesaurus_entries[0], 'antonyms.0.nyms', []) - .map(e => e.nym) - }; - - if (include.example) { - result.examples = _.reduce(example_groups, (accumulator, example_group) => { - let example = _.get(example_group, 'examples', []); - - accumulator = accumulator.concat(example); - - return accumulator; - }, []); - } - - return result; - }) - }; - }) - }; - }); -} - -async function queryInternet (word, language) { - let 
url = new URL('https://www.google.com/async/callback:5493'); - - url.searchParams.set('fc', 'ErUBCndBTlVfTnFUN29LdXdNSlQ2VlZoWUIwWE1HaElOclFNU29TOFF4ZGxGbV9zbzA3YmQ2NnJyQXlHNVlrb3l3OXgtREpRbXpNZ0M1NWZPeFo4NjQyVlA3S2ZQOHpYa292MFBMaDQweGRNQjR4eTlld1E4bDlCbXFJMBIWU2JzSllkLVpHc3J5OVFPb3Q2aVlDZxoiQU9NWVJ3QmU2cHRlbjZEZmw5U0lXT1lOR3hsM2xBWGFldw'); - url.searchParams.set('fcv', '3'); - url.searchParams.set('async', `term:${encodeURIComponent(word)},corpus:${language},hhdr:true,hwdgt:true,wfp:true,ttl:,tsl:,ptl:`); - - url = url.toString(); - - let response = await fetch(url, { - agent: httpsAgent, - headers: new fetch.Headers({ - "accept": "*/*", - "accept-encoding": "gzip, deflate, br", - "accept-language": "en-US,en;q=0.9", - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36" - }) + let { + meanings, + ...otherProps + } = entry; + + meanings = meanings.reduce((meanings, meaning) => { + let partOfSpeech, definitions; + + ({ + partOfSpeech, + definitions + } = meaning); + meanings[partOfSpeech] = definitions; + + return meanings; + }, {}); + + return { + ...otherProps, + meaning: meanings + }; }); - - if (response.status === 404) { throw new errors.NoDefinitionsFound({ reason: 'Website returned 404.'}); } - - if (response.status === 429) { throw new errors.RateLimitError(); } - - if (response.status !== 200) { throw new errors.NoDefinitionsFound({ reason: 'Threw non 200 status code.'}); } - - let body = await response.text(), - data = JSON.parse(body.substring(4)), - single_results = _.get(data, 'feature-callback.payload.single_results', []), - error = _.chain(single_results) - .find('widget') - .get('widget.error') - .value() - - if (single_results.length === 0) { throw new errors.NoDefinitionsFound({ word, language }); } - - if (error === 'TERM_NOT_FOUND_ERROR') { throw new errors.NoDefinitionsFound({ word, language }); } - - if (error) { throw new errors.UnexpectedError({ error }); } - - 
return single_results; } -async function fetchFromSource (word, language) { - let dictionaryData = await queryInternet(word, language); - - return dictionaryData; +function transformWiktionary(word, data) { + return [{ + word: word, + phonetics: [], + meanings: data.map(entry => ({ + partOfSpeech: entry.partOfSpeech.toLowerCase(), + definitions: entry.definitions.map(def => ({ + definition: def.definition, + example: def.examples && def.examples.length > 0 ? def.examples[0] : undefined + })) + })) + }]; } -async function findDefinitions (word, language, { include }) { - let dictionaryData = await fetchFromSource(word, language); - - if (_.isEmpty(dictionaryData)) { throw new errors.UnexpectedError(); } - - return transform(word, language, dictionaryData, { include }); +async function findDefinitions(word, language, { include }) { + // We strictly use en.wiktionary.org for now as it has the reliable REST API. + const candidates = _.uniq([ + word, + word.toLowerCase(), + word.charAt(0).toUpperCase() + word.slice(1), + word.toUpperCase() + ]); + + for (const candidate of candidates) { + const url = `https://en.wiktionary.org/api/rest_v1/page/definition/${encodeURIComponent(candidate)}`; + + try { + const response = await fetch(url); + + if (response.status === 200) { + const json = await response.json(); + if (json[language]) { + return transformWiktionary(candidate, json[language]); + } + } + } catch (err) { + // Ignore errors and try next candidate + console.error(`Failed to fetch for candidate: ${candidate}`, err); + } + } + + // If we reach here, no candidates worked + throw new errors.NoDefinitionsFound({ word, language }); } module.exports = { From 1fabfdd23d5502cabf0fe59a8581fa0ac6534c68 Mon Sep 17 00:00:00 2001 From: namatanda Date: Wed, 31 Dec 2025 22:47:11 +0300 Subject: [PATCH 2/4] chore: add Docker and additional file types to .gitignore --- .gitignore | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 
b512c09..a06f64f 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,7 @@ -node_modules \ No newline at end of file +node_modules +*.yml +*.sh +*.json +DOCKER.md +Dockerfile +.dockerignore From 7f1149f4bb28f83e39422053a450484134de20cb Mon Sep 17 00:00:00 2001 From: namatanda Date: Wed, 31 Dec 2025 23:09:46 +0300 Subject: [PATCH 3/4] feat: Implement a hybrid dictionary data source strategy, querying Google first and falling back to Wiktionary. --- modules/dictionary.js | 196 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 187 insertions(+), 9 deletions(-) diff --git a/modules/dictionary.js b/modules/dictionary.js index 94a6e4c..ffd2b35 100644 --- a/modules/dictionary.js +++ b/modules/dictionary.js @@ -1,5 +1,12 @@ -const fetch = require('node-fetch'), - errors = require('./errors.js'); +const fs = require('fs'), + _ = require('lodash'), + https = require('https'), + fetch = require('node-fetch'), + + utils = require('./utils.js'), + errors = require('./errors.js'), + + httpsAgent = new https.Agent({ keepAlive: true }); function transformV2toV1(data) { return data.map((entry) => { @@ -27,26 +34,172 @@ function transformV2toV1(data) { }); } +// Original Google transform function +function transformGoogle(word, language, data, { include }) { + return data + .map(e => e.entry) + .filter(e => e) + .reduce((accumulator, entry) => { + if (!entry.subentries) { return accumulator.push(entry) && accumulator; } + + let { subentries } = entry, + mappedSubentries; + + if (subentries.length > 1) { + utils.logEvent(word, language, 'subentries length is greater than 1', { data }); + } + + if (entry.sense_families) { + utils.logEvent(word, language, 'entry has subentries and sense families', { data }); + } + + if (entry.etymology) { + utils.logEvent(word, language, 'entry has subentries and etymology', { data }); + } + + mappedSubentries = subentries + .map((subentry) => { + if (subentry.sense_families) { + utils.logEvent(word, language, 'subentry has sense families', { 
data }); + } + + if (subentry.sense_family) { + subentry.sense_families = []; + subentry.sense_families.push(subentry.sense_family); + } + + return _.defaults(subentry, _.pick(entry, ['phonetics', 'etymology'])) + }) + + return accumulator.concat(mappedSubentries); + }, []) + .map((entry) => { + let { headword, lemma, phonetics = [], etymology = {}, sense_families = [] } = entry; + + return { + word: lemma || headword, + phonetic: _.get(phonetics, '0.text'), + phonetics: phonetics.map((e) => { + return { + text: e.text, + audio: e.oxford_audio + }; + }), + origin: _.get(etymology, 'etymology.text'), + meanings: sense_families.map((sense_family) => { + let { parts_of_speech, senses = [] } = sense_family; + + if (!parts_of_speech) { + parts_of_speech = _.get(senses[0], 'parts_of_speech', []); + + if (senses.length > 1) { + utils.logEvent(word, language, 'part of speech missing but more than one sense present', { data }); + } + } + + if (parts_of_speech.length > 1) { + utils.logEvent(word, language, 'more than one part of speech present', { data }); + } + + return { + partOfSpeech: _.get(parts_of_speech[0], 'value'), + definitions: senses.map((sense) => { + let { definition = {}, example_groups = [], thesaurus_entries = [] } = sense, + result = { + definition: definition.text, + example: _.get(example_groups[0], 'examples.0'), + synonyms: _.get(thesaurus_entries[0], 'synonyms.0.nyms', []) + .map(e => e.nym), + antonyms: _.get(thesaurus_entries[0], 'antonyms.0.nyms', []) + .map(e => e.nym) + }; + + if (include.example) { + result.examples = _.reduce(example_groups, (accumulator, example_group) => { + let example = _.get(example_group, 'examples', []); + + accumulator = accumulator.concat(example); + + return accumulator; + }, []); + } + + return result; + }) + }; + }) + }; + }); +} + +// Original Google API query +async function queryGoogle(word, language) { + let url = new URL('https://www.google.com/async/callback:5493'); + + url.searchParams.set('fc', 
'ErUBCndBTlVfTnFUN29LdXdNSlQ2VlZoWUIwWE1HaElOclFNU29TOFF4ZGxGbV9zbzA3YmQ2NnJyQXlHNVlrb3l3OXgtREpRbXpNZ0M1NWZPeFo4NjQyVlA3S2ZQOHpYa292MFBMaDQweGRNQjR4eTlld1E4bDlCbXFJMBIWU2JzSllkLVpHc3J5OVFPb3Q2aVlDZxoiQU9NWVJ3QmU2cHRlbjZEZmw5U0lXT1lOR3hsM2xBWGFldw'); + url.searchParams.set('fcv', '3'); + url.searchParams.set('async', `term:${encodeURIComponent(word)},corpus:${language},hhdr:true,hwdgt:true,wfp:true,ttl:,tsl:,ptl:`); + + url = url.toString(); + + let response = await fetch(url, { + agent: httpsAgent, + headers: new fetch.Headers({ + "accept": "*/*", + "accept-encoding": "gzip, deflate, br", + "accept-language": "en-US,en;q=0.9", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36" + }) + }); + + if (response.status === 404) { throw new errors.NoDefinitionsFound({ reason: 'Website returned 404.' }); } + + if (response.status === 429) { throw new errors.RateLimitError(); } + + if (response.status !== 200) { throw new Error(`Google returned status ${response.status}`); } + + let body = await response.text(), + data = JSON.parse(body.substring(4)), + single_results = _.get(data, 'feature-callback.payload.single_results', []), + error = _.chain(single_results) + .find('widget') + .get('widget.error') + .value() + + if (single_results.length === 0) { throw new errors.NoDefinitionsFound({ word, language }); } + + if (error === 'TERM_NOT_FOUND_ERROR') { throw new errors.NoDefinitionsFound({ word, language }); } + + if (error) { throw new Error(`Google returned error: ${error}`); } + + return single_results; +} + +// Wiktionary transform function function transformWiktionary(word, data) { return [{ word: word, + phonetic: '', phonetics: [], + origin: '', meanings: data.map(entry => ({ partOfSpeech: entry.partOfSpeech.toLowerCase(), definitions: entry.definitions.map(def => ({ definition: def.definition, - example: def.examples && def.examples.length > 0 ? 
def.examples[0] : undefined + example: def.examples && def.examples.length > 0 ? def.examples[0] : undefined, + synonyms: [], + antonyms: [] })) })) }]; } -async function findDefinitions(word, language, { include }) { - // We strictly use en.wiktionary.org for now as it has the reliable REST API. +// Wiktionary API query +async function queryWiktionary(word, language) { const candidates = _.uniq([ word, word.toLowerCase(), - word.charAt(0).toUpperCase() + word.slice(1), + word.charAt(0).toUpperCase() + word.slice(1).toLowerCase(), word.toUpperCase() ]); @@ -59,16 +212,41 @@ async function findDefinitions(word, language, { include }) { if (response.status === 200) { const json = await response.json(); if (json[language]) { - return transformWiktionary(candidate, json[language]); + return { data: json[language], word: candidate }; } } } catch (err) { // Ignore errors and try next candidate - console.error(`Failed to fetch for candidate: ${candidate}`, err); + console.error(`Wiktionary: Failed to fetch for candidate: ${candidate}`, err.message); + } + } + + return null; +} + +async function findDefinitions(word, language, { include }) { + // Strategy: Try Google first (has richer data), fallback to Wiktionary if Google fails + + // Try Google first + try { + const googleData = await queryGoogle(word, language); + if (!_.isEmpty(googleData)) { + console.log(`Using Google data for: ${word}`); + return transformGoogle(word, language, googleData, { include }); } + } catch (googleError) { + console.log(`Google failed for "${word}": ${googleError.message}, trying Wiktionary...`); + } + + // Fallback to Wiktionary + const wiktionaryResult = await queryWiktionary(word, language); + + if (wiktionaryResult) { + console.log(`Using Wiktionary data for: ${word}`); + return transformWiktionary(wiktionaryResult.word, wiktionaryResult.data); } - // If we reach here, no candidates worked + // Both sources failed throw new errors.NoDefinitionsFound({ word, language }); } From 
de5c5791021e502ac041ea8e1f59e957cdb853f5 Mon Sep 17 00:00:00 2001 From: namatanda Date: Wed, 31 Dec 2025 23:16:28 +0300 Subject: [PATCH 4/4] added local files to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a06f64f..bf08dc9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ node_modules DOCKER.md Dockerfile .dockerignore +PR_MESSAGE.md