This commit is contained in:
talorr
2026-03-27 03:36:08 +03:00
parent 8a97ce6d54
commit cda36918e8
225 changed files with 35641 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
# Container image for the forecast OCR service, based on the slim Debian
# bookworm variant of the official Node.js 22 image.
FROM node:22-bookworm-slim
# Every following instruction and the final process run relative to /app.
WORKDIR /app
# Copy only the manifests first so the dependency layer can be reused by the
# build cache while just src/ changes. The trailing "*" makes the lockfile
# optional for COPY.
COPY package.json package-lock.json* ./
RUN npm install
COPY src ./src
# Documents the port used when PORT is unset (src/index.js falls back to 4010).
EXPOSE 4010
# Runs the "start" script from package.json (node src/index.js).
CMD ["npm", "start"]

Binary file not shown.

1702
forecast-ocr-service/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,18 @@
{
"name": "forecast-ocr-service",
"version": "0.1.0",
"private": true,
"type": "commonjs",
"main": "src/index.js",
"scripts": {
"start": "node src/index.js",
"dev": "node --watch src/index.js"
},
"dependencies": {
"axios": "^1.8.4",
"express": "^4.21.2",
"multer": "^2.0.2",
"sharp": "^0.34.4",
"tesseract.js": "^6.0.1"
}
}

Binary file not shown.

View File

@@ -0,0 +1,43 @@
const express = require('express');
const multer = require('multer');
const { recognizeImage } = require('./ocr');
// Largest accepted upload, in bytes (10 MiB).
const MAX_UPLOAD_BYTES = 10 * 1024 * 1024;
// Port used when the PORT environment variable is unset or empty.
const DEFAULT_PORT = 4010;

const app = express();

// Uploads are buffered in memory; handlers read them from req.file.buffer.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: MAX_UPLOAD_BYTES },
});

const port = Number(process.env.PORT || DEFAULT_PORT);

// Parse JSON request bodies of up to 10 MB.
app.use(express.json({ limit: '10mb' }));
/**
 * GET /health — liveness probe that always answers with `{ ok: true }`.
 */
function reportHealth(_req, res) {
  res.json({ ok: true });
}

app.get('/health', reportHealth);
/**
 * POST /ocr/forecast — runs OCR on the image sent in multipart field `image`.
 *
 * Responses (all JSON):
 *   200 { rawForecast, forecast, cached }
 *   400 { error }  missing/empty `image` field or a rejected multipart upload
 *   413 { error }  upload larger than the configured multer fileSize limit
 *   500 { error }  preprocessing or OCR failed
 */
app.post(
  '/ocr/forecast',
  (req, res, next) => {
    // Multer is invoked manually so that its own failures (oversized file,
    // unexpected field name, ...) are answered with the same JSON error shape
    // as the rest of the route. When used directly as route middleware those
    // errors skip the handler's try/catch and fall through to Express's
    // default error handler.
    upload.single('image')(req, res, (uploadError) => {
      if (!uploadError) {
        return next();
      }
      if (uploadError instanceof multer.MulterError) {
        const status = uploadError.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
        return res.status(status).json({ error: uploadError.message });
      }
      return next(uploadError);
    });
  },
  async (req, res) => {
    try {
      const imageBuffer = req.file?.buffer || null;
      // A Buffer is truthy even when empty, so the length is checked too:
      // a zero-byte upload is a client error, not an OCR failure.
      if (!imageBuffer || imageBuffer.length === 0) {
        return res.status(400).json({
          error: 'Provide multipart field image',
        });
      }
      const result = await recognizeImage({ imageBuffer });
      return res.json({
        rawForecast: result.rawForecast,
        forecast: result.forecast,
        cached: result.cached,
      });
    } catch (error) {
      // Keep a server-side trace; previously the failure was only echoed to
      // the client and never logged.
      console.error('Forecast OCR failed:', error);
      return res.status(500).json({
        error: error?.message ?? String(error),
      });
    }
  }
);
/**
 * Logs the bound port once the HTTP server is accepting connections.
 */
function announceListening() {
  console.log(`Forecast OCR service listening on port ${port}`);
}

app.listen(port, announceListening);

View File

@@ -0,0 +1,292 @@
const { createHash } = require('node:crypto');
const sharp = require('sharp');
const { createWorker, PSM } = require('tesseract.js');
// Memoised promise for the shared Tesseract worker; assigned on first use by
// getWorker().
let workerPromise = null;
// In-process cache of OCR results, read and written by recognizeImage().
const resultCache = new Map();
// The 33 uppercase and 33 lowercase Russian letters (including \u0401/\u0451),
// written as escapes. getWorker() uses it as part of the Tesseract character
// whitelist.
const CYRILLIC_WHITELIST =
'\u0410\u0411\u0412\u0413\u0414\u0415\u0401\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042a\u042b\u042c\u042d\u042e\u042f' +
'\u0430\u0431\u0432\u0433\u0434\u0435\u0451\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f';
// Pairs of [pattern, replacement]: each uppercase Latin letter is mapped to
// the Cyrillic capital it visually resembles. normalizeLine() applies every
// pair, in order, to each OCR line. Only uppercase Latin letters are listed;
// lowercase Latin text is left untouched by this table.
const LATIN_TO_CYRILLIC_LOOKALIKES = [
[/A/g, '\u0410'],
[/B/g, '\u0412'],
[/C/g, '\u0421'],
[/E/g, '\u0415'],
[/H/g, '\u041d'],
[/K/g, '\u041a'],
[/M/g, '\u041c'],
[/O/g, '\u041e'],
[/P/g, '\u0420'],
[/T/g, '\u0422'],
[/X/g, '\u0425'],
[/Y/g, '\u0423']
];
/**
 * Returns the shared Tesseract worker, creating and configuring it on first
 * use. Concurrent callers share the same in-flight initialisation promise.
 *
 * The worker loads the `rus+eng` languages, treats the image as a single text
 * block, preserves inter-word spaces and restricts output to Cyrillic and
 * Latin letters, digits and the punctuation `().,+-/:` plus space.
 *
 * @returns {Promise<object>} Resolves to the configured tesseract.js worker.
 * @throws Rejects with the underlying error when the worker cannot be created
 *   or configured. The failure is NOT memoised, so the next call retries.
 */
async function getWorker() {
  if (!workerPromise) {
    const pending = (async () => {
      const worker = await createWorker('rus+eng', 1, {
        logger: () => undefined,
      });
      try {
        await worker.setParameters({
          tessedit_pageseg_mode: String(PSM.SINGLE_BLOCK),
          preserve_interword_spaces: '1',
          tessedit_char_whitelist: `${CYRILLIC_WHITELIST}ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789().,+-/: `,
        });
      } catch (error) {
        // Do not leak a half-configured worker; a failure while terminating is
        // deliberately ignored because the configuration error is the one the
        // caller needs to see.
        await worker.terminate().catch(() => undefined);
        throw error;
      }
      return worker;
    })();
    workerPromise = pending;
    // Forget a failed initialisation so one transient error (for example a
    // failed language-data load) does not poison every later request.
    pending.catch(() => {
      if (workerPromise === pending) {
        workerPromise = null;
      }
    });
  }
  return workerPromise;
}
/**
 * Cleans one OCR line: drops a leading run of digits followed by whitespace,
 * turns every `|` into `/`, turns the trademark sign into Cyrillic `\u041c`,
 * and finally applies each LATIN_TO_CYRILLIC_LOOKALIKES pair in order.
 *
 * @param {string} line - A single, already trimmed OCR line.
 * @returns {string} The cleaned line.
 */
function normalizeLine(line) {
  const withoutArtifacts = line
    .replace(/^[0-9]+\s+/, '')
    .replace(/\|/g, '/')
    .replace(/\u2122/g, '\u041c');
  return LATIN_TO_CYRILLIC_LOOKALIKES.reduce(
    (text, [latinPattern, cyrillicLetter]) => text.replace(latinPattern, cyrillicLetter),
    withoutArtifacts
  );
}
/**
 * Turns raw multi-line OCR output into a single tidy line.
 *
 * Steps: split on CR/LF, trim and drop empty lines, clean each line with
 * normalizeLine(), join with spaces, collapse whitespace, remove spaces before
 * `) , . :` and after `(`, drop one stray leading character that is followed
 * by whitespace and a word of two or more letters, and rewrite
 * `digit , digit` / `digit . digit` (with optional spaces) as `digit.digit`.
 *
 * @param {*} value - Raw OCR text; falsy input yields null.
 * @returns {string|null} The normalised text, or null when nothing is left.
 */
function normalizeOcrText(value) {
  if (!value) {
    return null;
  }

  const cleanedLines = [];
  for (const rawLine of String(value).replace(/\r/g, '\n').split('\n')) {
    const trimmedLine = rawLine.trim();
    if (trimmedLine) {
      cleanedLines.push(normalizeLine(trimmedLine));
    }
  }

  const tidied = cleanedLines
    .join(' ')
    .replace(/\s+/g, ' ')
    .replace(/\s+([),.:])/g, '$1')
    .replace(/([(])\s+/g, '$1')
    .replace(/^[A-Za-z\u0400-\u04FF0-9]\s+(?=[A-Za-z\u0400-\u04FF]{2,})/, '')
    .replace(/(\d)\s*[,.]\s*(\d)/g, '$1.$2')
    .trim();

  return tidied ? tidied : null;
}
/**
 * Canonicalises total markets to `\u0422\u041c (n)` (under) / `\u0422\u0411 (n)` (over).
 *
 * 1. Stand-alone two-letter tokens (Latin, Cyrillic or mixed, any case) are
 *    rewritten to uppercase Cyrillic `\u0422\u041c` / `\u0422\u0411`; `\u0422\u0412` is treated as a
 *    misread `\u0422\u0411`.
 * 2. Single-letter shorthands followed by a number become `\u0422\u041c (n)` /
 *    `\u0422\u0411 (n)`.
 * 3. A bare number after `\u0422\u041c` / `\u0422\u0411` (not preceded by `\u0418`) is wrapped
 *    in parentheses.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with totals canonicalised.
 */
function canonicalizeTotals(value) {
  return value
    // JavaScript's `\b` only knows ASCII word characters, so it never sees a
    // boundary between whitespace and a Cyrillic letter. Explicit lookarounds
    // give the two-letter tokens a script-aware word boundary, which makes
    // Cyrillic "\u0442\u043c" / "\u0422\u0412" and mixed "\u0422m" match as intended.
    .replace(/(?<![\w\u0400-\u04FF])[t\u0442][m\u043c](?![\w\u0400-\u04FF])/gi, '\u0422\u041c')
    .replace(/(?<![\w\u0400-\u04FF])[t\u0442][b\u0431\u0432](?![\w\u0400-\u04FF])/gi, '\u0422\u0411')
    // NOTE(review): the single-letter shorthands below still use ASCII `\b`,
    // so their Cyrillic alternatives only match next to an ASCII word
    // character. They are left as-is on purpose: with a script-aware boundary
    // the Russian preposition "\u0432" followed by a number would be rewritten to
    // a total. Confirm the desired behaviour before widening them.
    .replace(/\b[u\u0443]\s*([0-9]+(?:\.[0-9]+)?)\b/gi, '\u0422\u041c ($1)')
    .replace(/\b[o\u043e]\s*([0-9]+(?:\.[0-9]+)?)\b/gi, '\u0422\u0411 ($1)')
    .replace(/\b[m\u043c]\s*([0-9]+(?:\.[0-9]+)?)\b/gi, '\u0422\u041c ($1)')
    .replace(/\b[b\u0432]\s*([0-9]+(?:\.[0-9]+)?)\b/gi, '\u0422\u0411 ($1)')
    // `[^\u0418]` keeps team totals (\u0418\u0422\u041c1 / \u0418\u0422\u04112) out of this rule.
    .replace(/(^|[^\u0418])(\u0422[\u041c\u0411])\s*([0-9]+(?:\.[0-9]+)?)/g, '$1$2 ($3)');
}
/**
 * Repairs a single OCR token that should read as a team total
 * (`\u0418\u0422\u041c1`, `\u0418\u0422\u041c2`, `\u0418\u0422\u04111`, `\u0418\u0422\u04112`).
 *
 * The token is stripped of whitespace, uppercased and passed through a list
 * of look-alike substitutions. If the result has the shape
 * `[\u0418|\u041c] \u0422 [\u041c|\u0411] [1|2]` the canonical token is returned; otherwise the
 * original token is returned unchanged.
 *
 * @param {*} token - Candidate token; falsy values are returned as-is.
 * @returns {*} Canonical team-total token or the untouched input.
 */
function normalizeTeamTotalToken(token) {
  if (!token) {
    return token;
  }

  // Applied in order to the uppercased, whitespace-free token.
  const lookalikeFixes = [
    [/[|!IL]/g, '1'],
    [/Z/g, '2'],
    [/[\u0418N]/g, '\u0418'],
    [/[M\u041c]/g, '\u041c'],
    [/[T\u0422]/g, '\u0422'],
    [/[B\u0412]/g, '\u0411'],
  ];

  let compact = String(token).replace(/\s+/g, '').toUpperCase();
  for (const [pattern, replacement] of lookalikeFixes) {
    compact = compact.replace(pattern, replacement);
  }

  // A leading \u041c is accepted as a misread \u0418.
  const shape = compact.match(/^[\u0418\u041c]\u0422([\u041c\u0411])([12])$/);
  return shape ? `\u0418\u0422${shape[1]}${shape[2]}` : token;
}
/**
 * Finds tokens that look like team totals and repairs each with
 * normalizeTeamTotalToken().
 *
 * A candidate is 3-4 letters (Latin/Cyrillic) or `|`, `!`, `1`, followed by
 * `1`, `2` or `Z`; it must start the string or follow whitespace, and be
 * followed by whitespace, `(` or the end of the string.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with team-total tokens repaired.
 */
function canonicalizeTeamTotals(value) {
  const teamTotalCandidate = /(^|\s)([A-Za-z\u0400-\u04FF|!1]{3,4}[12Z])(?=\s|\(|$)/g;
  return value.replace(
    teamTotalCandidate,
    (_whole, lead, token) => lead + normalizeTeamTotalToken(token)
  );
}
/**
 * Collapses `(n) something (n)` into `something (n)` when the same
 * parenthesised line value appears at both the start and the end.
 *
 * The value may carry an explicit sign so that handicap lines such as
 * `(-1.5) ... (-1.5)` are handled the same way as unsigned totals; this
 * matches the signed number format canonicalizeHandicap() produces.
 *
 * @param {string} value - Canonicalised forecast text.
 * @returns {string} De-duplicated text, or `value` unchanged if no match.
 */
function dedupeEdgeLineValue(value) {
  const match = value.match(/^\(([+-]?\d+(?:\.\d+)?)\)\s+(.+?)\s+\(\1\)$/);
  if (!match) {
    return value;
  }
  const [, lineValue, market] = match;
  return `${market} (${lineValue})`;
}
/**
 * Canonicalises the short qualifiers `\u0441 \u041e\u0422`, `\u0438 \u041e\u0422`, `\u0441 \u041d\u0415\u0422` and `\u0441 \u0414\u0410`
 * (Latin look-alikes and any letter case accepted), and prefixes `\u0441 ` when
 * the text starts with `\u041e\u0422 \u0422\u041c`.
 *
 * JavaScript's `\b` only recognises ASCII word characters, so it never matches
 * between whitespace and a Cyrillic letter. Explicit lookarounds are used
 * instead so purely Cyrillic input is recognised as well.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with outcome prefixes canonicalised.
 */
function canonicalizeOutcomePrefixes(value) {
  let normalized = value
    .replace(/(?<![\w\u0400-\u04FF])[c\u0441]\s+[o\u043e][t\u0442](?![\w\u0400-\u04FF])/gi, '\u0441 \u041e\u0422')
    .replace(/(?<![\w\u0400-\u04FF])[\u0438u]\s+[o\u043e][t\u0442](?![\w\u0400-\u04FF])/gi, '\u0438 \u041e\u0422')
    .replace(/(?<![\w\u0400-\u04FF])[c\u0441]\s+[n\u043d][e\u0435][t\u0442](?![\w\u0400-\u04FF])/gi, '\u0441 \u041d\u0415\u0422')
    .replace(/(?<![\w\u0400-\u04FF])[c\u0441]\s+[\u0434d][a\u0430](?![\w\u0400-\u04FF])/gi, '\u0441 \u0414\u0410');
  // Only a following letter disqualifies the token; a digit may follow
  // directly ("\u041e\u0422 \u0422\u041c5.5"), which the previous `\b` check also accepted.
  if (/^\u041e\u0422\s+\u0422\u041c(?![A-Za-z_\u0400-\u04FF])/.test(normalized)) {
    normalized = `\u0441 ${normalized}`;
  }
  return normalized;
}
/**
 * Canonicalises the "both teams to score" market to `\u041e\u0417`, `\u041e\u0417 \u0434\u0430` and
 * `\u041e\u0417 \u043d\u0435\u0442`, accepting Latin look-alikes, the digit `3` for `\u0417`, any
 * letter case and a missing space before the answer.
 *
 * JavaScript's `\b` only recognises ASCII word characters, so it never matches
 * between whitespace and a Cyrillic letter. Explicit lookarounds are used
 * instead so purely Cyrillic input is recognised as well.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with the market canonicalised.
 */
function canonicalizeBothTeamsToScore(value) {
  return value
    // `\.\d` guard: "o3.5" is a number glued to a letter, not the token "\u041e\u0417".
    .replace(/(?<![\w\u0400-\u04FF])[o\u043e][3\u0437](?![\w\u0400-\u04FF]|\.\d)/gi, '\u041e\u0417')
    .replace(/(?<![\w\u0400-\u04FF])\u041e\u0417\s*[\u0434d][\u0430a](?![\w\u0400-\u04FF])/gi, '\u041e\u0417 \u0434\u0430')
    .replace(/(?<![\w\u0400-\u04FF])\u041e\u0417\s*[\u043dh][\u0435e][\u0442t](?![\w\u0400-\u04FF])/gi, '\u041e\u0417 \u043d\u0435\u0442');
}
/**
 * Repairs a single OCR token that should be a match-result selection and
 * returns its canonical form: `\u041f1`, `\u041f2`, `X`, `1X`, `X2` or `12`.
 *
 * The token is stripped of whitespace, uppercased and cleaned of digit
 * look-alikes (`| ! I L` -> 1, `Z` -> 2); Cyrillic `\u0425` and `\u041c` are folded to
 * Latin `X` and `M` for matching. `M1`, `MM` and `M2` are accepted as misreads
 * of `\u041f1` / `\u041f2`.
 *
 * The win prefix accepts Cyrillic `\u041f`, Latin `P` and Cyrillic `\u0420`. The last
 * one matters because normalizeLine() rewrites every Latin `P` to Cyrillic
 * `\u0420` before this function runs, so without it "P1" from the OCR engine
 * would never be recognised.
 *
 * @param {*} token - Candidate token; falsy values are returned as-is.
 * @returns {*} Canonical selection, or the untouched input if unrecognised.
 */
function normalizeResultSelectionToken(token) {
  if (!token) return token;
  const compact = String(token)
    .replace(/\s+/g, '')
    .toUpperCase()
    .replace(/[|!IL]/g, '1')
    .replace(/Z/g, '2')
    .replace(/[\u0425\u0445]/g, 'X')
    .replace(/[\u041c\u043c]/g, 'M');

  const win = compact.match(/^[\u041f\u0420P]?([12])$/);
  if (win) return `\u041f${win[1]}`;
  if (/^(?:X|1X|X2|12)$/.test(compact)) return compact;
  if (/^M[1M]$/.test(compact)) return '\u041f1';
  if (compact === 'M2') return '\u041f2';
  return token;
}
/**
 * Repairs the selection token that follows the phrase
 * "\u041e\u0441\u043d\u043e\u0432\u043d\u0430\u044f \u0438\u0433\u0440\u0430" (matched case-insensitively) by passing it through
 * normalizeResultSelectionToken(). The phrase itself is kept as written.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with main-game selections repaired.
 */
function canonicalizeMainGameSelections(value) {
  const mainGameSelection = /(\u041e\u0441\u043d\u043e\u0432\u043d\u0430\u044f\s+\u0438\u0433\u0440\u0430\s+)([A-Za-z\u0400-\u04FF0-9|!]+)/gi;
  return value.replace(
    mainGameSelection,
    (_whole, label, selection) => label + normalizeResultSelectionToken(selection)
  );
}
/**
 * Canonicalises stand-alone match-result selections anywhere in the text:
 * `\u041f1` / `\u041f2` (also from Latin `P`, Cyrillic `\u0420`, or a misread uppercase
 * `M`/`\u041c`, with optional inner whitespace) and `X`, `1X`, `X2`, `1X2` (from
 * Latin or Cyrillic `x`/`\u0445`).
 *
 * JavaScript's `\b` only recognises ASCII word characters, so it never matches
 * between whitespace and a Cyrillic letter; tokens such as Cyrillic "\u04201"
 * (what normalizeLine() makes of Latin "P1") were therefore never rewritten.
 * Explicit lookarounds provide a script-aware token boundary instead. A
 * selection immediately followed by a decimal fraction ("\u041c 2.5") is treated
 * as a number, not a selection, and is left alone.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with result selections canonicalised.
 */
function canonicalizeResultMarket(value) {
  return value
    .replace(/(?<![\w\u0400-\u04FF])[\u041fP\u0420]\s*([12])(?![\w\u0400-\u04FF]|\.\d)/gi, '\u041f$1')
    // Case-sensitive on purpose: only an uppercase M/\u041c is taken as a misread \u041f.
    .replace(/(?<![\w\u0400-\u04FF])[M\u041c]\s*[1M\u041c](?![\w\u0400-\u04FF]|\.\d)/g, '\u041f1')
    .replace(/(?<![\w\u0400-\u04FF])[M\u041c]\s*2(?![\w\u0400-\u04FF]|\.\d)/g, '\u041f2')
    // Covers X, 1X, X2 and 1X2 in a single pass.
    .replace(/(?<![\w\u0400-\u04FF])(1?)[x\u0445](2?)(?![\w\u0400-\u04FF]|\.\d)/gi, '$1X$2');
}
/**
 * Repairs the team index glued to the word "\u0424\u041e\u0420\u0410" (matched
 * case-insensitively, spelling kept as written).
 *
 * A trailing `\u041b`, `L`, `I`, `l`, `|`, `!` or `1` becomes `1`; a trailing `2`
 * stays `2`. Whitespace between the word and the index is removed. The label
 * must start the string or follow whitespace, and the index must be followed
 * by whitespace, `(` or the end of the string.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with handicap labels repaired.
 */
function canonicalizeHandicapMarketLabels(value) {
  const firstTeamLabel = /(^|\s)(\u0424\u041e\u0420\u0410)\s*[\u041bLIl|!1](?=\s|\(|$)/gi;
  const secondTeamLabel = /(^|\s)(\u0424\u041e\u0420\u0410)\s*2(?=\s|\(|$)/gi;
  // Builds a replacer that re-emits the leading whitespace and label,
  // followed by the given team index.
  const withTeam = (teamIndex) => (_whole, lead, label) => lead + label + teamIndex;

  const firstTeamFixed = value.replace(firstTeamLabel, withTeam('1'));
  return firstTeamFixed.replace(secondTeamLabel, withTeam('2'));
}
/**
 * Canonicalises handicap markets to `\u0424\u041e\u0420\u04101 (n)` / `\u0424\u041e\u0420\u04102 (n)`.
 *
 * 1. The word "\u0424\u041e\u0420\u0410" spelled with Latin look-alikes and/or a misread
 *    team index is rewritten to uppercase Cyrillic.
 * 2. The stand-alone shorthand `\u04441` / `\u04442` (Latin `f` or Cyrillic `\u0444`, any
 *    case, optional inner whitespace) is expanded to `\u0424\u041e\u0420\u04101` / `\u0424\u041e\u0420\u04102`.
 * 3. A bare, optionally signed number after the market is wrapped in
 *    parentheses, and exactly one space is put before an opening parenthesis.
 *
 * JavaScript's `\b` only recognises ASCII word characters, so it never matches
 * between whitespace and a Cyrillic letter; the shorthand in step 2 therefore
 * uses explicit lookarounds so that Cyrillic "\u04441" is recognised too.
 *
 * @param {string} value - Forecast text.
 * @returns {string} Text with handicaps canonicalised.
 */
function canonicalizeHandicap(value) {
  return value
    .replace(/(^|\s)[\u0424F][\u041eO][\u0420P][A\u0410][\u041bLIl|!1](?=\s|\(|$)/gi, '$1\u0424\u041e\u0420\u04101')
    .replace(/(^|\s)[\u0424F][\u041eO][\u0420P][A\u0410]2(?=\s|\(|$)/gi, '$1\u0424\u041e\u0420\u04102')
    .replace(/(?<![\w\u0400-\u04FF])[\u0444f]\s*([12])(?![\w\u0400-\u04FF])/gi, '\u0424\u041e\u0420\u0410$1')
    .replace(/(^|\s)\u0424\u041e\u0420\u0410([12])\s*([+-]?\d+(?:\.\d+)?)/g, '$1\u0424\u041e\u0420\u0410$2 ($3)')
    .replace(/(^|\s)\u0424\u041e\u0420\u0410([12])\s*\(/g, '$1\u0424\u041e\u0420\u0410$2 (');
}
/**
 * Runs normalised OCR text through every market canonicaliser and returns the
 * final forecast string.
 *
 * The transforms run in a fixed order (outcome prefixes, both-teams-to-score,
 * main-game selections, result market, handicap labels, team totals, totals,
 * handicaps). Afterwards whitespace is collapsed, spaces before `) , . :` and
 * after `(` are removed, a leading `\u041e\u0422 \u0422\u041c` (optionally preceded by Latin or
 * Cyrillic `c`) is rewritten to `\u0441 \u041e\u0422 \u0422\u041c`, and a line value duplicated at
 * both edges is de-duplicated.
 *
 * @param {*} value - Normalised OCR text; falsy input yields null.
 * @returns {string|null} Canonical forecast, or null when nothing is left.
 */
function canonicalizeForecast(value) {
  if (!value) return null;

  const transforms = [
    canonicalizeOutcomePrefixes,
    canonicalizeBothTeamsToScore,
    canonicalizeMainGameSelections,
    canonicalizeResultMarket,
    canonicalizeHandicapMarketLabels,
    canonicalizeTeamTotals,
    canonicalizeTotals,
    canonicalizeHandicap,
  ];

  // String() guards the exported entry point against non-string truthy input.
  const canonical = transforms
    .reduce((current, transform) => transform(current), String(value))
    .replace(/\s+/g, ' ')
    .replace(/\s+([),.:])/g, '$1')
    .replace(/([(])\s+/g, '$1')
    // `\b` is ASCII-only in JavaScript, so after the Cyrillic "\u0422\u041c" it only
    // matched when an ASCII word character followed directly; the usual
    // "\u041e\u0422 \u0422\u041c (5.5)" never got its "\u0441 " prefix back. A letter-only negative
    // lookahead accepts a following space, parenthesis, digit or end of text.
    .replace(/^(?:[c\u0441]\s+)?\u041e\u0422\s+\u0422\u041c(?![A-Za-z_\u0400-\u04FF])/, '\u0441 \u041e\u0422 \u0422\u041c')
    .trim();

  return dedupeEdgeLineValue(canonical) || null;
}
/**
 * Prepares an uploaded image for OCR and returns it as a PNG buffer.
 *
 * Pipeline, in order: flatten transparency onto white, convert to greyscale,
 * apply a light blur (sigma 0.3), threshold at 165, trim the border, resize
 * to fit within a width of 900 px using nearest-neighbour sampling (with
 * `withoutEnlargement: false`), and encode as PNG.
 *
 * @param {Buffer} inputBuffer - Raw image bytes.
 * @returns {Promise<Buffer>} The preprocessed PNG.
 */
async function preprocessImage(inputBuffer) {
  const resizeOptions = {
    width: 900,
    kernel: sharp.kernel.nearest,
    fit: 'inside',
    withoutEnlargement: false,
  };

  const pipeline = sharp(inputBuffer)
    .flatten({ background: '#ffffff' })
    .grayscale()
    .blur(0.3)
    .threshold(165, { grayscale: true })
    .trim()
    .resize(resizeOptions)
    .png();

  return pipeline.toBuffer();
}
/**
 * Runs the full OCR pipeline on one image: preprocess, recognise with the
 * shared Tesseract worker, normalise the text and canonicalise the forecast.
 *
 * @param {Buffer} buffer - Raw image bytes.
 * @returns {Promise<{rawForecast: (string|null), forecast: (string|null)}>}
 *   The normalised OCR text and its canonical form.
 */
async function recognizeBuffer(buffer) {
  // Preprocessing runs before the worker is requested, so an unreadable image
  // fails before the OCR engine is touched.
  const cleanedImage = await preprocessImage(buffer);
  const ocrWorker = await getWorker();
  const recognition = await ocrWorker.recognize(cleanedImage);

  const rawForecast = normalizeOcrText(recognition.data.text);
  const forecast = canonicalizeForecast(rawForecast);
  return { rawForecast, forecast };
}
// Upper bound on cached OCR results; the oldest entry is evicted first so a
// long-running process cannot grow the cache without limit.
const RESULT_CACHE_MAX_ENTRIES = 500;

/**
 * Recognises the forecast text in an image, serving repeated uploads of the
 * same bytes from an in-process cache.
 *
 * The cache key is a SHA-256 digest of the whole buffer. The previous key
 * (length plus the first 32 bytes) only covered the file header; for PNGs
 * that is the signature and the IHDR chunk, so two different images with the
 * same dimensions and file size shared a key and the second one received the
 * first one's forecast.
 *
 * @param {{imageBuffer: Buffer}} params - The raw image bytes.
 * @returns {Promise<{rawForecast: (string|null), forecast: (string|null), cached: boolean}>}
 *   The OCR result and whether it came from the cache.
 * @throws {Error} 'Image payload is missing' when `imageBuffer` is falsy;
 *   preprocessing and OCR errors propagate unchanged.
 */
async function recognizeImage({ imageBuffer }) {
  if (!imageBuffer) {
    throw new Error('Image payload is missing');
  }

  const cacheKey = `sha256:${createHash('sha256').update(imageBuffer).digest('hex')}`;
  const cachedResult = resultCache.get(cacheKey);
  if (cachedResult) {
    return {
      ...cachedResult,
      cached: true,
    };
  }

  const result = await recognizeBuffer(imageBuffer);

  // A Map iterates in insertion order, so the first key is the oldest entry.
  if (!resultCache.has(cacheKey) && resultCache.size >= RESULT_CACHE_MAX_ENTRIES) {
    resultCache.delete(resultCache.keys().next().value);
  }
  resultCache.set(cacheKey, result);

  return {
    ...result,
    cached: false,
  };
}
module.exports = {
recognizeImage,
canonicalizeForecast
};

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB