init
This commit is contained in:
12
forecast-ocr-service/Dockerfile
Normal file
12
forecast-ocr-service/Dockerfile
Normal file
@@ -0,0 +1,12 @@
|
||||
# Runtime image for the forecast OCR service.
FROM node:22-bookworm-slim

WORKDIR /app

# Install exactly what package-lock.json pins, without dev dependencies, so
# image builds are reproducible (npm install may re-resolve versions).
COPY package.json package-lock.json ./
RUN npm ci --omit=dev

# Ship the committed language data next to the app so the first OCR request
# does not have to download it.
# NOTE(review): assumes tesseract.js looks for `<lang>.traineddata` in the
# working directory before fetching it - confirm for the pinned version.
COPY eng.traineddata rus.traineddata ./

COPY src ./src

ENV NODE_ENV=production

EXPOSE 4010

CMD ["npm", "start"]
|
||||
BIN
forecast-ocr-service/eng.traineddata
Normal file
BIN
forecast-ocr-service/eng.traineddata
Normal file
Binary file not shown.
1702
forecast-ocr-service/package-lock.json
generated
Normal file
1702
forecast-ocr-service/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
18
forecast-ocr-service/package.json
Normal file
18
forecast-ocr-service/package.json
Normal file
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"name": "forecast-ocr-service",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "commonjs",
|
||||
"main": "src/index.js",
|
||||
"scripts": {
|
||||
"start": "node src/index.js",
|
||||
"dev": "node --watch src/index.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"axios": "^1.8.4",
|
||||
"express": "^4.21.2",
|
||||
"multer": "^2.0.2",
|
||||
"sharp": "^0.34.4",
|
||||
"tesseract.js": "^6.0.1"
|
||||
}
|
||||
}
|
||||
BIN
forecast-ocr-service/rus.traineddata
Normal file
BIN
forecast-ocr-service/rus.traineddata
Normal file
Binary file not shown.
43
forecast-ocr-service/src/index.js
Normal file
43
forecast-ocr-service/src/index.js
Normal file
@@ -0,0 +1,43 @@
|
||||
const express = require('express');
|
||||
const multer = require('multer');
|
||||
const { recognizeImage } = require('./ocr');
|
||||
|
||||
const app = express();

// Uploads are buffered in memory and capped at 10 MiB per file.
const upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 10 * 1024 * 1024 } });
const port = Number(process.env.PORT || 4010);

app.use(express.json({ limit: '10mb' }));

// Liveness probe.
app.get('/health', (_req, res) => {
  res.json({ ok: true });
});

/**
 * POST /ocr/forecast
 *
 * Expects a multipart form with a single file field named `image`.
 * Responds with `{ rawForecast, forecast, cached }` on success,
 * 400 when the field is missing or empty, and 500 when recognition fails.
 */
app.post('/ocr/forecast', upload.single('image'), async (req, res) => {
  try {
    const imageBuffer = req.file?.buffer || null;

    // A zero-byte upload cannot be decoded, so treat it like a missing field.
    if (!imageBuffer || imageBuffer.length === 0) {
      return res.status(400).json({
        error: 'Provide multipart field image'
      });
    }

    const result = await recognizeImage({
      imageBuffer
    });

    return res.json({
      rawForecast: result.rawForecast,
      forecast: result.forecast,
      cached: result.cached
    });
  } catch (error) {
    // Keep a server-side trace; the response only carries the message.
    console.error('Forecast OCR failed:', error);

    return res.status(500).json({
      error: error.message
    });
  }
});

// Errors raised by middleware (multer limits, malformed bodies) never reach
// the route's try/catch, so translate them to JSON here instead of letting
// Express fall back to its default HTML error page.
app.use((error, _req, res, next) => {
  if (res.headersSent) {
    return next(error);
  }

  if (error instanceof multer.MulterError) {
    const status = error.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
    return res.status(status).json({ error: error.message });
  }

  const hasClientStatus = Number.isInteger(error?.status) && error.status >= 400 && error.status < 500;
  if (hasClientStatus) {
    return res.status(error.status).json({ error: error.message });
  }

  console.error('Unhandled request error:', error);
  return res.status(500).json({ error: 'Internal server error' });
});

app.listen(port, () => {
  console.log(`Forecast OCR service listening on port ${port}`);
});
|
||||
292
forecast-ocr-service/src/ocr.js
Normal file
292
forecast-ocr-service/src/ocr.js
Normal file
@@ -0,0 +1,292 @@
|
||||
const { createHash } = require('node:crypto');

const sharp = require('sharp');
const { createWorker, PSM } = require('tesseract.js');
|
||||
|
||||
// Promise for the shared Tesseract worker; stays null until one is requested.
let workerPromise = null;

// Recognition results that have already been computed, keyed per image.
const resultCache = new Map();

// The 33 upper-case Russian letters followed by the 33 lower-case ones
// (with the letter Yo placed right after Ye in both halves).
const CYRILLIC_WHITELIST = [
  '\u0410\u0411\u0412\u0413\u0414\u0415\u0401\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042a\u042b\u042c\u042d\u042e\u042f',
  '\u0430\u0431\u0432\u0433\u0434\u0435\u0451\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f'
].join('');

// [pattern, replacement] pairs that swap a Latin capital for the Cyrillic
// capital that looks the same. Each pattern is global and case-sensitive.
const LATIN_TO_CYRILLIC_LOOKALIKES = Object.entries({
  A: '\u0410',
  B: '\u0412',
  C: '\u0421',
  E: '\u0415',
  H: '\u041d',
  K: '\u041a',
  M: '\u041c',
  O: '\u041e',
  P: '\u0420',
  T: '\u0422',
  X: '\u0425',
  Y: '\u0423'
}).map(([latin, cyrillic]) => [new RegExp(latin, 'g'), cyrillic]);
|
||||
|
||||
/**
 * Returns the shared Tesseract worker, creating and configuring it on the
 * first call. Later calls reuse the same promise.
 *
 * The worker is created for `rus+eng` with a silent logger and configured for
 * single-block page segmentation, preserved inter-word spaces and a character
 * whitelist of Cyrillic letters, Latin letters, digits and `().,+-/: `.
 *
 * If initialisation fails the cached promise is dropped, so the next call
 * retries instead of replaying the same rejection forever.
 *
 * @returns {Promise<object>} Resolves to the ready-to-use worker.
 */
async function getWorker() {
  if (!workerPromise) {
    const pending = (async () => {
      const worker = await createWorker('rus+eng', 1, {
        logger: () => undefined
      });

      try {
        await worker.setParameters({
          tessedit_pageseg_mode: String(PSM.SINGLE_BLOCK),
          preserve_interword_spaces: '1',
          tessedit_char_whitelist: `${CYRILLIC_WHITELIST}ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789().,+-/: `
        });
      } catch (error) {
        // Best-effort cleanup of the half-configured worker; the original
        // configuration error is the one worth reporting.
        await worker.terminate().catch(() => undefined);
        throw error;
      }

      return worker;
    })();

    workerPromise = pending;

    // Forget a failed initialisation. Callers still observe the rejection
    // through the promise returned below.
    pending.catch(() => {
      if (workerPromise === pending) {
        workerPromise = null;
      }
    });
  }

  return workerPromise;
}
|
||||
|
||||
/**
 * Cleans up one OCR line.
 *
 * Drops a leading run of digits followed by whitespace, turns every `|` into
 * `/`, turns the trademark sign into the Cyrillic capital Em, and finally
 * replaces each Latin capital listed in LATIN_TO_CYRILLIC_LOOKALIKES with its
 * Cyrillic twin.
 *
 * @param {string} line - A single, already trimmed line of OCR output.
 * @returns {string} The cleaned line.
 */
function normalizeLine(line) {
  const withoutLeadingNumber = line.replace(/^[0-9]+\s+/, '');
  const withSlashes = withoutLeadingNumber.replace(/\|/g, '/');
  const withoutTrademark = withSlashes.replace(/\u2122/g, '\u041c');

  return LATIN_TO_CYRILLIC_LOOKALIKES.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    withoutTrademark
  );
}
|
||||
|
||||
/**
 * Flattens raw multi-line OCR output into one tidy line.
 *
 * Blank lines are dropped, every remaining line goes through normalizeLine,
 * and the joined text has its whitespace and punctuation spacing cleaned up.
 *
 * @param {*} value - Raw OCR text; any falsy value yields null.
 * @returns {string|null} The flattened text, or null when nothing is left.
 */
function normalizeOcrText(value) {
  if (!value) {
    return null;
  }

  const cleanedLines = [];
  for (const rawLine of String(value).replace(/\r/g, '\n').split('\n')) {
    const line = rawLine.trim();
    if (line) {
      cleanedLines.push(normalizeLine(line));
    }
  }

  const singleLine = cleanedLines.join(' ').replace(/\s+/g, ' ');

  // No space before `) , . :` and none after `(`.
  const tightPunctuation = singleLine
    .replace(/\s+([),.:])/g, '$1')
    .replace(/([(])\s+/g, '$1');

  // Drop a lone leading character that is followed by a word of two or more letters.
  const withoutStrayLead = tightPunctuation.replace(
    /^[A-Za-z\u0400-\u04FF0-9]\s+(?=[A-Za-z\u0400-\u04FF]{2,})/,
    ''
  );

  // Write decimals with a dot and no inner spaces: "2 , 5" becomes "2.5".
  const result = withoutStrayLead.replace(/(\d)\s*[,.]\s*(\d)/g, '$1.$2').trim();

  return result || null;
}
|
||||
|
||||
/**
 * Rewrites total markets to the canonical `ТМ (n)` / `ТБ (n)` form
 * (Cyrillic Te-Em / Te-Be followed by the line in parentheses).
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with totals canonicalized.
 */
function canonicalizeTotals(value) {
  return value
    // Two-letter tokens. JavaScript's `\b` only treats ASCII letters, digits
    // and `_` as word characters, so it never fires next to Cyrillic letters;
    // explicit Unicode-aware lookarounds make "ТВ" / "тм" match as well.
    .replace(/(?<![\p{L}\p{N}_])[t\u0442][m\u043c](?![\p{L}\p{N}_])/giu, '\u0422\u041c')
    .replace(/(?<![\p{L}\p{N}_])[t\u0442][b\u0432](?![\p{L}\p{N}_])/giu, '\u0422\u0411')
    // Single-letter shorthands followed by a number.
    // NOTE(review): these keep the ASCII `\b` on purpose. Because of it they
    // only fire for Latin letters; widening them to Cyrillic would also
    // rewrite ordinary text such as the preposition in "в 2". Confirm
    // whether the Cyrillic variants should be enabled.
    .replace(/\b[u\u0443]\s*([0-9]+(?:\.[0-9]+)?)\b/gi, '\u0422\u041c ($1)')
    .replace(/\b[o\u043e]\s*([0-9]+(?:\.[0-9]+)?)\b/gi, '\u0422\u0411 ($1)')
    .replace(/\b[m\u043c]\s*([0-9]+(?:\.[0-9]+)?)\b/gi, '\u0422\u041c ($1)')
    .replace(/\b[b\u0432]\s*([0-9]+(?:\.[0-9]+)?)\b/gi, '\u0422\u0411 ($1)')
    // Wrap a bare line in parentheses, unless the token is the tail of a
    // team total (preceded by the Cyrillic letter I).
    .replace(/(^|[^\u0418])(\u0422[\u041c\u0411])\s*([0-9]+(?:\.[0-9]+)?)/g, '$1$2 ($3)');
}
|
||||
|
||||
/**
 * Repairs a single team-total token (Cyrillic I-Te-Em or I-Te-Be followed by
 * team number 1 or 2) that OCR may have spelled with look-alike characters.
 *
 * @param {*} token - Candidate token; falsy values are returned unchanged.
 * @returns {*} The canonical token, or the original input when it does not
 *   have the team-total shape.
 */
function normalizeTeamTotalToken(token) {
  if (!token) {
    return token;
  }

  // Applied in order to the upper-cased token with all whitespace removed.
  const lookalikeFixes = [
    [/[|!IL]/g, '1'],
    [/Z/g, '2'],
    [/[\u0418N]/g, '\u0418'],
    [/[M\u041c]/g, '\u041c'],
    [/[T\u0422]/g, '\u0422'],
    [/[B\u0412]/g, '\u0411']
  ];

  let compact = String(token).replace(/\s+/g, '').toUpperCase();
  for (const [pattern, replacement] of lookalikeFixes) {
    compact = compact.replace(pattern, replacement);
  }

  const teamNumber = compact.slice(-1);

  // The first letter may have been read as Em instead of I; both are accepted.
  if (/^[\u0418\u041c]\u0422\u041c[12]$/.test(compact)) {
    return `\u0418\u0422\u041c${teamNumber}`;
  }

  if (/^[\u0418\u041c]\u0422\u0411[12]$/.test(compact)) {
    return `\u0418\u0422\u0411${teamNumber}`;
  }

  return token;
}
|
||||
|
||||
/**
 * Finds tokens shaped like a team total and passes each one through
 * normalizeTeamTotalToken.
 *
 * A candidate is three or four letters (Latin, Cyrillic, or the look-alikes
 * `|`, `!`, `1`) followed by `1`, `2` or `Z`. It must start at the beginning
 * of the text or after whitespace, and end before whitespace, `(` or the end.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with team-total tokens canonicalized.
 */
function canonicalizeTeamTotals(value) {
  const candidatePattern = /(^|\s)([A-Za-z\u0400-\u04FF|!1]{3,4}[12Z])(?=\s|\(|$)/g;

  return value.replace(
    candidatePattern,
    (_whole, lead, token) => lead + normalizeTeamTotalToken(token)
  );
}
|
||||
|
||||
/**
 * Collapses a value whose line number appears in parentheses at both ends,
 * e.g. "(2.5) X (2.5)" becomes "X (2.5)".
 *
 * @param {string} value - Forecast text.
 * @returns {string} The text without the leading duplicate, or the input
 *   unchanged when the two edge values differ or are absent.
 */
function dedupeEdgeLineValue(value) {
  const duplicated = value.match(/^\((\d+(?:\.\d+)?)\)\s+(.+?)\s+\(\1\)$/);

  if (duplicated) {
    const [, lineValue, body] = duplicated;
    return `${body} (${lineValue})`;
  }

  return value;
}
|
||||
|
||||
/**
 * Canonicalizes the short outcome qualifiers ("с ОТ", "и ОТ", "с НЕТ",
 * "с ДА"), accepting Latin look-alikes and any letter case. When the text
 * starts with a bare "ОТ ТМ", the missing leading "с" is restored.
 *
 * JavaScript's `\b` only treats ASCII letters, digits and `_` as word
 * characters, so it never fires next to Cyrillic letters. Token edges are
 * therefore checked with Unicode-aware lookarounds instead.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with outcome qualifiers canonicalized.
 */
function canonicalizeOutcomePrefixes(value) {
  let normalized = value
    .replace(/(?<![\p{L}\p{N}_])[c\u0441]\s+[o\u043e][t\u0442](?![\p{L}\p{N}_])/giu, '\u0441 \u041e\u0422')
    .replace(/(?<![\p{L}\p{N}_])[\u0438u]\s+[o\u043e][t\u0442](?![\p{L}\p{N}_])/giu, '\u0438 \u041e\u0422')
    .replace(/(?<![\p{L}\p{N}_])[c\u0441]\s+[n\u043d][e\u0435][t\u0442](?![\p{L}\p{N}_])/giu, '\u0441 \u041d\u0415\u0422')
    .replace(/(?<![\p{L}\p{N}_])[c\u0441]\s+[\u0434d][a\u0430](?![\p{L}\p{N}_])/giu, '\u0441 \u0414\u0410');

  // "ТМ" must end the token or run straight into its number ("ТМ2.5");
  // only a following letter disqualifies it.
  if (/^\u041e\u0422\s+\u0422\u041c(?!\p{L})/u.test(normalized)) {
    normalized = `\u0441 ${normalized}`;
  }

  return normalized;
}
|
||||
|
||||
/**
 * Canonicalizes the "ОЗ" market token and its "да" / "нет" selections.
 *
 * Accepts Latin look-alikes, a digit 3 in place of the Cyrillic letter Ze,
 * any letter case, and a missing space before the selection.
 *
 * JavaScript's `\b` only treats ASCII letters, digits and `_` as word
 * characters, so it never fires next to Cyrillic letters. Token edges are
 * therefore checked with Unicode-aware lookarounds instead.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with the token and its selection canonicalized.
 */
function canonicalizeBothTeamsToScore(value) {
  return value
    // Stand-alone token.
    .replace(/(?<![\p{L}\p{N}_])[o\u043e][3\u0437](?![\p{L}\p{N}_])/giu, '\u041e\u0417')
    // Token plus selection. The token is matched in its raw spellings too,
    // so a glued form such as "o3да" is still repaired.
    .replace(/(?<![\p{L}\p{N}_])[o\u043e][3\u0437]\s*[\u0434d][\u0430a](?![\p{L}\p{N}_])/giu, '\u041e\u0417 \u0434\u0430')
    .replace(/(?<![\p{L}\p{N}_])[o\u043e][3\u0437]\s*[\u043dh][\u0435e][\u0442t](?![\p{L}\p{N}_])/giu, '\u041e\u0417 \u043d\u0435\u0442');
}
|
||||
|
||||
/**
 * Maps one match-result selection token to its canonical spelling:
 * "П1" / "П2" (Cyrillic Pe plus team number), "X", "1X", "X2" or "12".
 *
 * The token is compared after removing whitespace, upper-casing and undoing
 * common OCR confusions (`|`, `!`, `I`, `L` for 1; `Z` for 2; Cyrillic Kha
 * for X; Cyrillic Em for M).
 *
 * The win prefix is accepted as Cyrillic Pe, Latin P, or Cyrillic Er. The
 * last one is needed because the line normalizer turns a Latin capital P
 * into Cyrillic Er before this function runs.
 *
 * @param {*} token - Candidate token; falsy values are returned unchanged.
 * @returns {*} The canonical selection, or the original input when it is not
 *   recognised.
 */
function normalizeResultSelectionToken(token) {
  if (!token) return token;

  const compact = String(token)
    .replace(/\s+/g, '')
    .toUpperCase()
    .replace(/[|!IL]/g, '1')
    .replace(/Z/g, '2')
    .replace(/[\u0425\u0445]/g, 'X')
    .replace(/[\u041c\u043c]/g, 'M');

  if (/^[\u041f\u0420P]?1$/.test(compact)) return '\u041f1';
  if (/^[\u041f\u0420P]?2$/.test(compact)) return '\u041f2';

  // Draw and double-chance selections are already canonical once compacted.
  if (compact === 'X' || compact === '1X' || compact === 'X2' || compact === '12') {
    return compact;
  }

  // The prefix letter read as M.
  if (/^M[1M]$/.test(compact)) return '\u041f1';
  if (compact === 'M2') return '\u041f2';

  return token;
}
|
||||
|
||||
/**
 * Canonicalizes the selection token that directly follows the phrase
 * "Основная игра " (matched case-insensitively). The phrase itself is kept
 * exactly as written; only the token goes through
 * normalizeResultSelectionToken.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with those selection tokens canonicalized.
 */
function canonicalizeMainGameSelections(value) {
  const phraseWithToken = /(\u041e\u0441\u043d\u043e\u0432\u043d\u0430\u044f\s+\u0438\u0433\u0440\u0430\s+)([A-Za-z\u0400-\u04FF0-9|!]+)/gi;

  return value.replace(
    phraseWithToken,
    (_whole, phrase, selection) => phrase + normalizeResultSelectionToken(selection)
  );
}
|
||||
|
||||
/**
 * Canonicalizes free-standing match-result selections anywhere in the text:
 * "П1" / "П2" (Cyrillic Pe plus team number), "X", "1X" and "X2".
 *
 * JavaScript's `\b` only treats ASCII letters, digits and `_` as word
 * characters, so it never fires next to Cyrillic letters. Token edges are
 * therefore checked with Unicode-aware lookarounds instead.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with result selections canonicalized.
 */
function canonicalizeResultMarket(value) {
  return value
    // Prefix written as Cyrillic Pe, Latin P or Cyrillic Er, any case.
    .replace(/(?<![\p{L}\p{N}_])[\u041fP\u0420]\s*1(?![\p{L}\p{N}_])/giu, '\u041f1')
    .replace(/(?<![\p{L}\p{N}_])[\u041fP\u0420]\s*2(?![\p{L}\p{N}_])/giu, '\u041f2')
    // Prefix read as a capital M (Latin or Cyrillic). A digit that starts a
    // decimal ("M 2.5") is skipped: that text is a total line, and consuming
    // its integer part here would leave "П2.5".
    .replace(/(?<![\p{L}\p{N}_])[M\u041c]\s*[1M\u041c](?![\p{L}\p{N}_]|[.,]\d)/gu, '\u041f1')
    .replace(/(?<![\p{L}\p{N}_])[M\u041c]\s*2(?![\p{L}\p{N}_]|[.,]\d)/gu, '\u041f2')
    // Draw and double chance, emitted with a Latin X.
    // ("12" needs no rule: it is already canonical.)
    .replace(/(?<![\p{L}\p{N}_])[x\u0445](?![\p{L}\p{N}_])/giu, 'X')
    .replace(/(?<![\p{L}\p{N}_])1[x\u0445](?![\p{L}\p{N}_])/giu, '1X')
    .replace(/(?<![\p{L}\p{N}_])[x\u0445]2(?![\p{L}\p{N}_])/giu, 'X2');
}
|
||||
|
||||
/**
 * Attaches the team number directly to a Cyrillic "ФОРА" label.
 *
 * Team 1 may be written as `1` or one of its look-alikes (Cyrillic El, `L`,
 * `I`, `l`, `|`, `!`); team 2 only as `2`. The label must start the text or
 * follow whitespace, and the number must be followed by whitespace, `(` or
 * the end. The label keeps the letter case it was written in.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with the team number glued to the label.
 */
function canonicalizeHandicapMarketLabels(value) {
  // Builds a replacer that keeps the leading whitespace and the label as
  // written, then appends the given team digit.
  const glueDigit = (digit) => (_whole, lead, label) => lead + label + digit;

  const withTeamOne = value.replace(
    /(^|\s)(\u0424\u041e\u0420\u0410)\s*[\u041bLIl|!1](?=\s|\(|$)/gi,
    glueDigit('1')
  );

  return withTeamOne.replace(
    /(^|\s)(\u0424\u041e\u0420\u0410)\s*2(?=\s|\(|$)/gi,
    glueDigit('2')
  );
}
|
||||
|
||||
/**
 * Rewrites handicap markets to the canonical `ФОРА1 (n)` / `ФОРА2 (n)` form.
 *
 * Handles the full label spelled with Latin look-alikes, the one-letter
 * shorthand (Cyrillic Ef or Latin F plus team number), a bare signed line
 * after the label, and a missing space before an opening parenthesis.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with handicaps canonicalized.
 */
function canonicalizeHandicap(value) {
  return value
    // Full label in any mix of Cyrillic and Latin look-alikes.
    .replace(/(^|\s)[\u0424F][\u041eO][\u0420P][A\u0410][\u041bLIl|!1](?=\s|\(|$)/gi, '$1\u0424\u041e\u0420\u04101')
    .replace(/(^|\s)[\u0424F][\u041eO][\u0420P][A\u0410]2(?=\s|\(|$)/gi, '$1\u0424\u041e\u0420\u04102')
    // One-letter shorthand. JavaScript's `\b` only treats ASCII letters,
    // digits and `_` as word characters, so it never fires before a Cyrillic
    // Ef; Unicode-aware lookarounds are used instead.
    .replace(/(?<![\p{L}\p{N}_])[\u0444f]\s*1(?![\p{L}\p{N}_])/giu, '\u0424\u041e\u0420\u04101')
    .replace(/(?<![\p{L}\p{N}_])[\u0444f]\s*2(?![\p{L}\p{N}_])/giu, '\u0424\u041e\u0420\u04102')
    // Wrap a bare line in parentheses, then force one space before "(".
    .replace(/(^|\s)\u0424\u041e\u0420\u0410([12])\s*([+-]?\d+(?:\.\d+)?)/g, '$1\u0424\u041e\u0420\u0410$2 ($3)')
    .replace(/(^|\s)\u0424\u041e\u0420\u0410([12])\s*\(/g, '$1\u0424\u041e\u0420\u0410$2 (');
}
|
||||
|
||||
/**
 * Turns normalized OCR text into the canonical forecast string.
 *
 * Runs the market-specific canonicalizers in a fixed order, tidies
 * whitespace and punctuation spacing, restores the leading "с" in front of a
 * bare "ОТ ТМ", and removes a line value duplicated at both ends.
 *
 * @param {string|null|undefined} value - Normalized OCR text.
 * @returns {string|null} The canonical forecast, or null when the input is
 *   falsy or nothing is left.
 */
function canonicalizeForecast(value) {
  if (!value) return null;

  // Order matters: later steps rely on the tokens produced by earlier ones.
  const transforms = [
    canonicalizeOutcomePrefixes,
    canonicalizeBothTeamsToScore,
    canonicalizeMainGameSelections,
    canonicalizeResultMarket,
    canonicalizeHandicapMarketLabels,
    canonicalizeTeamTotals,
    canonicalizeTotals,
    canonicalizeHandicap
  ];

  const normalized = transforms
    .reduce((current, transform) => transform(current), value)
    .replace(/\s+/g, ' ')
    .replace(/\s+([),.:])/g, '$1')
    .replace(/([(])\s+/g, '$1')
    // `\b` only treats ASCII letters, digits and `_` as word characters, so
    // after the Cyrillic "ТМ" it fails before a space or "(". A lookahead
    // that only rejects a following letter is used instead.
    .replace(/^(?:[c\u0441]\s+)?\u041e\u0422\s+\u0422\u041c(?!\p{L})/u, '\u0441 \u041e\u0422 \u0422\u041c')
    .trim();

  return dedupeEdgeLineValue(normalized) || null;
}
|
||||
|
||||
/**
 * Prepares an uploaded image for OCR and returns it as a PNG buffer.
 *
 * The sharp pipeline requests, in call order: flatten onto a white
 * background, grayscale, a 0.3 blur, a threshold at 165, trimming, a resize
 * to a width of 900 with the nearest-neighbour kernel, and PNG encoding.
 *
 * @param {Buffer} inputBuffer - Encoded source image.
 * @returns {Promise<Buffer>} The preprocessed PNG.
 */
async function preprocessImage(inputBuffer) {
  const resizeOptions = {
    width: 900,
    kernel: sharp.kernel.nearest,
    fit: 'inside',
    // Enlargement is explicitly not disabled.
    withoutEnlargement: false
  };

  const pipeline = sharp(inputBuffer)
    .flatten({ background: '#ffffff' })
    .grayscale()
    .blur(0.3)
    .threshold(165, { grayscale: true })
    .trim()
    .resize(resizeOptions)
    .png();

  return pipeline.toBuffer();
}
|
||||
|
||||
/**
 * Runs the whole OCR pass for one image: preprocessing, recognition with the
 * shared worker, text normalization and forecast canonicalization.
 *
 * @param {Buffer} buffer - Encoded source image.
 * @returns {Promise<{rawForecast: (string|null), forecast: (string|null)}>}
 *   The normalized OCR text and its canonical form.
 */
async function recognizeBuffer(buffer) {
  // Preprocess first, so the worker is only requested once the image has
  // been prepared successfully.
  const prepared = await preprocessImage(buffer);
  const ocrWorker = await getWorker();

  const recognition = await ocrWorker.recognize(prepared);
  const rawForecast = normalizeOcrText(recognition.data.text);
  const forecast = canonicalizeForecast(rawForecast);

  return { rawForecast, forecast };
}
|
||||
|
||||
// Upper bound on remembered results. The oldest entry is evicted first, so
// a long-running process cannot grow the cache without limit.
const RESULT_CACHE_MAX_ENTRIES = 500;

/**
 * Derives the cache key from a digest of the whole image.
 *
 * Length plus the first bytes is not enough: two PNG files of equal size and
 * equal dimensions can share their leading bytes, which would make one image
 * return the other's cached result.
 *
 * @param {Buffer} imageBuffer - Encoded source image.
 * @returns {string} Cache key unique to the image content.
 */
function buildCacheKey(imageBuffer) {
  return `sha256:${createHash('sha256').update(imageBuffer).digest('hex')}`;
}

/**
 * Recognizes the forecast on an image, reusing the cached result when the
 * same image content has been processed before.
 *
 * @param {{imageBuffer: Buffer}} params - The encoded image to read.
 * @returns {Promise<{rawForecast: (string|null), forecast: (string|null), cached: boolean}>}
 *   The recognition result; `cached` tells whether it came from the cache.
 * @throws {Error} When `imageBuffer` is missing.
 */
async function recognizeImage({ imageBuffer }) {
  if (!imageBuffer) {
    throw new Error('Image payload is missing');
  }

  const cacheKey = buildCacheKey(imageBuffer);

  if (resultCache.has(cacheKey)) {
    return {
      ...resultCache.get(cacheKey),
      cached: true
    };
  }

  const result = await recognizeBuffer(imageBuffer);

  // Map iterates in insertion order, so the first key is the oldest entry.
  if (resultCache.size >= RESULT_CACHE_MAX_ENTRIES) {
    resultCache.delete(resultCache.keys().next().value);
  }
  resultCache.set(cacheKey, result);

  return {
    ...result,
    cached: false
  };
}
|
||||
|
||||
// Public API of this module: the OCR entry point and the text canonicalizer.
module.exports = { recognizeImage, canonicalizeForecast };
|
||||
BIN
forecast-ocr-service/tmp-forecast-handicap.png
Normal file
BIN
forecast-ocr-service/tmp-forecast-handicap.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 20 KiB |
BIN
forecast-ocr-service/tmp-forecast-test-preprocessed.png
Normal file
BIN
forecast-ocr-service/tmp-forecast-test-preprocessed.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 2.9 KiB |
BIN
forecast-ocr-service/tmp-forecast-test.png
Normal file
BIN
forecast-ocr-service/tmp-forecast-test.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 19 KiB |
Reference in New Issue
Block a user