import { AzureExtractionResult } from '@luminovo/http-client';
import { BoundingBox } from '../boundingBox';
import { generateId } from '../generateId';
import { Attribute, AttributeExtractionRule, ExtractionItem, Extractor } from '../types';

/**
 * Builds an extractor that turns the paragraphs of an Azure extraction result into extraction items.
 *
 * - `generator` yields one item per paragraph whose content is shorter than 150 characters;
 *   paragraphs of 150 characters or more are skipped.
 * - `extractRegion` maps an item to a single region built from the paragraph's first bounding region,
 *   or to an empty list when the paragraph has no bounding region.
 *
 * @param extractionRules - Rules exposed unchanged on the returned extractor.
 * @returns An extractor made of `generator`, `extractRegion` and the given `extractionRules`.
 */
export function extractParagraphs<TAttribute extends Attribute>({
    extractionRules,
}: {
    extractionRules: AttributeExtractionRule<TAttribute>[];
}): Extractor<TAttribute> {
    return {
        generator: async function* (azureExtractionResult: AzureExtractionResult): AsyncGenerator<ExtractionItem> {
            const maxCharactersToAllowInOneParagraph = 150;
            // A result without paragraphs simply yields nothing.
            for (const paragraph of azureExtractionResult.analyzeResult.paragraphs ?? []) {
                // very long paragraphs run the risk of matching too many unrelated attributes (e.g. soldermask and silkscreen sides)
                if (paragraph.content.length < maxCharactersToAllowInOneParagraph) {
                    yield {
                        value: {
                            content: paragraph.content,
                            boundingRegions: paragraph.boundingRegions,
                        },
                        context: undefined,
                        // Assuming 1 as paragraphs have no confidence
                        confidence: 1,
                    };
                }
            }
        },

        extractRegion: ({ value }) => {
            const { content, boundingRegions } = value;
            // Only the first bounding region is used for the id, page number and box.
            const firstRegion = boundingRegions?.[0];
            if (firstRegion === undefined) {
                // No bounding region (missing or empty list): report no region instead of
                // throwing while destructuring `undefined`.
                return [];
            }
            const { pageNumber, polygon } = firstRegion;
            return [
                {
                    id: generateId({ content, boundingRegions: [{ pageNumber, polygon }] }),
                    content: content,
                    pageNumber,
                    box: BoundingBox.fromPolygon(polygon),
                    attributes: [],
                },
            ];
        },

        extractionRules,
    };
}
