feat: 使用 Gemini 边界框坐标改进文字排版

- 修改 Gemini prompt,提取文本元素的位置坐标(百分比)
- 新增 BoundingBox 和 TextElement 类型定义
- PPTX 生成使用动态位置而非硬编码固定位置
- SlideCard 显示每个文本元素的类型和位置信息

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Wei.Yu
2026-01-18 15:00:25 +08:00
parent 887c97529a
commit c61eb702a2
4 changed files with 150 additions and 64 deletions

View File

@@ -68,7 +68,7 @@ const App: React.FC = () => {
const initialSlides: ProcessedSlide[] = images.map((img, idx) => ({
id: idx + 1,
originalImage: img,
content: { title: '', bullets: [] },
content: { elements: [] },
status: 'pending'
}));
@@ -121,36 +121,50 @@ const App: React.FC = () => {
const pres = new pptxgen();
pres.layout = 'LAYOUT_16x9';
// 16:9 幻灯片尺寸: 10" x 5.625"
const SLIDE_WIDTH = 10;
const SLIDE_HEIGHT = 5.625;
processedSlides.forEach((slide) => {
const { title, bullets, notes, footer } = slide.content;
const { elements, notes } = slide.content;
const pptSlide = pres.addSlide();
// Add Background
if (slide.cleanedImage) {
pptSlide.background = { data: slide.cleanedImage };
}
// Title (Fixed Top Position)
pptSlide.addText(title || "Untitled", {
x: 0.5, y: 0.5, w: '90%', h: 1,
fontSize: 24, bold: true, color: '000000', fontFace: 'Arial'
});
// Body Content (Fixed Body Position)
if (bullets && bullets.length > 0) {
const bulletText = bullets.map(b => ({ text: b, options: { breakLine: true } }));
pptSlide.addText(bulletText, {
x: 0.5, y: 1.5, w: '90%', h: 3.5,
fontSize: 18, color: '333333', bullet: true, paraSpaceBefore: 10, valign: 'top'
});
}
// 使用动态位置添加每个文本元素
if (elements && elements.length > 0) {
elements.forEach(element => {
const { text, elementType, boundingBox, fontSize, alignment } = element;
// Footer
if (footer) {
pptSlide.addText(footer, {
x: 0.5, y: 5.2, w: '90%', h: 0.3,
fontSize: 10, color: '888888'
// 将百分比坐标转换为英寸
const x = (boundingBox.x / 100) * SLIDE_WIDTH;
const y = (boundingBox.y / 100) * SLIDE_HEIGHT;
const w = (boundingBox.width / 100) * SLIDE_WIDTH;
const h = (boundingBox.height / 100) * SLIDE_HEIGHT;
// 根据 fontSize 确定字号
const fontSizeValue = fontSize === 'large' ? 24
: fontSize === 'medium' ? 18 : 12;
// 根据元素类型设置样式
const isBold = elementType === 'title' || elementType === 'subtitle';
const color = elementType === 'footer' || elementType === 'caption' ? '888888' : '000000';
const isBullet = elementType === 'bullet';
pptSlide.addText(text, {
x, y, w, h,
fontSize: fontSizeValue,
bold: isBold,
color,
fontFace: 'Arial',
align: alignment || 'left',
valign: 'top',
bullet: isBullet ? { type: 'bullet' } : undefined
});
});
}

View File

@@ -65,35 +65,49 @@ export const SlideCard: React.FC<SlideCardProps> = ({ slide }) => {
<span className="text-blue-600 text-sm font-medium">Analyzing layout & text...</span>
</div>
) : (
<div className="space-y-4 h-full flex flex-col">
<div className="space-y-3 h-full flex flex-col">
<div className="flex justify-between items-start">
<div>
<span className="text-xs font-bold text-blue-600 uppercase tracking-wide">Title</span>
<h3 className="text-lg font-semibold text-gray-900 leading-tight">{slide.content.title || "Untitled Slide"}</h3>
</div>
<span className="text-xs font-bold text-blue-600 uppercase tracking-wide">
{slide.content.elements?.length || 0} Text Elements Detected
</span>
{hasCleanedBg && (
<span className="inline-flex items-center rounded-full bg-green-50 px-2 py-1 text-xs font-medium text-green-700 ring-1 ring-inset ring-green-600/20">
Background Cleaned
</span>
)}
</div>
<div className="flex-1 overflow-hidden relative border rounded-md p-3 bg-gray-50">
<span className="text-xs font-bold text-gray-500 uppercase tracking-wide block mb-2">Content</span>
<ul className="list-disc pl-5 text-sm text-gray-700 space-y-1 overflow-y-auto max-h-40">
{slide.content.bullets && slide.content.bullets.length > 0 ? (
slide.content.bullets.map((bullet, idx) => (
<li key={idx}>{bullet}</li>
))
) : (
<li className="text-gray-400 italic list-none">No body text detected.</li>
)}
</ul>
<div className="flex-1 overflow-y-auto space-y-2 max-h-60">
{slide.content.elements && slide.content.elements.length > 0 ? (
slide.content.elements.map((element, idx) => (
<div key={idx} className="border rounded-md p-2 bg-gray-50">
<div className="flex items-center gap-2 mb-1">
<span className={`text-xs font-semibold px-1.5 py-0.5 rounded ${
element.elementType === 'title' ? 'bg-blue-100 text-blue-700' :
element.elementType === 'subtitle' ? 'bg-purple-100 text-purple-700' :
element.elementType === 'bullet' ? 'bg-green-100 text-green-700' :
element.elementType === 'footer' ? 'bg-gray-200 text-gray-600' :
'bg-yellow-100 text-yellow-700'
}`}>
{element.elementType}
</span>
<span className="text-xs text-gray-400">
({Math.round(element.boundingBox.x)}%, {Math.round(element.boundingBox.y)}%)
</span>
</div>
<p className={`text-sm text-gray-800 ${element.elementType === 'title' ? 'font-semibold' : ''}`}>
{element.text.length > 150 ? element.text.slice(0, 150) + '...' : element.text}
</p>
</div>
))
) : (
<div className="text-gray-400 italic text-sm">No text elements detected.</div>
)}
</div>
{slide.content.footer && (
{slide.content.notes && (
<div className="text-xs text-gray-400 border-t pt-2">
<span className="font-semibold">Footer:</span> {slide.content.footer}
<span className="font-semibold">Notes:</span> {slide.content.notes}
</div>
)}
</div>

View File

@@ -21,16 +21,26 @@ export const analyzeSlideImage = async (apiKey: string, base64Image: string): Pr
},
},
{
text: `Analyze this presentation slide image and extract the content into a structured format for recreating it in PowerPoint.
Task:
1. Extract the main 'title' of the slide.
2. Extract the body text as a list of 'bullets'. Condense multi-line paragraphs into single distinct points where appropriate.
3. If there is small text at the bottom, extract it as 'footer'.
4. If there are speaker notes (or text that looks like it belongs in the notes), extract it.
Do not describe the visual layout (e.g., "blue background"), just extract the text content.
`,
text: `Analyze this presentation slide image and extract all text elements with their positions.
For each text element on the slide, provide:
1. The exact text content
2. The element type: "title", "subtitle", "body", "bullet", "footer", "caption", or "other"
3. The bounding box as percentages of slide dimensions (0-100):
- x: horizontal position from left edge
- y: vertical position from top edge
- width: element width
- height: element height
4. Font size category: "large" (titles, >24pt), "medium" (body, 14-24pt), "small" (<14pt)
5. Text alignment: "left", "center", or "right"
Important:
- Return ALL visible text elements, not just main content
- Estimate positions visually as percentages of the slide
- Group related bullet points together as one element with line breaks
- For speaker notes (if visible), include them separately in the "notes" field
Return the data as a JSON object with an "elements" array.`,
},
],
},
@@ -39,15 +49,41 @@ export const analyzeSlideImage = async (apiKey: string, base64Image: string): Pr
responseSchema: {
type: Type.OBJECT,
properties: {
title: { type: Type.STRING },
bullets: {
type: Type.ARRAY,
items: { type: Type.STRING }
elements: {
type: Type.ARRAY,
items: {
type: Type.OBJECT,
properties: {
text: { type: Type.STRING },
elementType: {
type: Type.STRING,
enum: ["title", "subtitle", "body", "bullet", "footer", "caption", "other"]
},
boundingBox: {
type: Type.OBJECT,
properties: {
x: { type: Type.NUMBER },
y: { type: Type.NUMBER },
width: { type: Type.NUMBER },
height: { type: Type.NUMBER }
},
required: ['x', 'y', 'width', 'height']
},
fontSize: {
type: Type.STRING,
enum: ["large", "medium", "small"]
},
alignment: {
type: Type.STRING,
enum: ["left", "center", "right"]
}
},
required: ['text', 'elementType', 'boundingBox']
}
},
footer: { type: Type.STRING },
notes: { type: Type.STRING },
notes: { type: Type.STRING }
},
required: ['title', 'bullets'],
required: ['elements'],
},
},
});
@@ -60,8 +96,13 @@ export const analyzeSlideImage = async (apiKey: string, base64Image: string): Pr
} catch (error) {
console.error("Error analyzing slide with Gemini:", error);
return {
title: "Error analyzing slide",
bullets: ["Could not extract content due to an AI error."],
elements: [{
text: "Error analyzing slide",
elementType: "title",
boundingBox: { x: 5, y: 5, width: 90, height: 10 },
fontSize: "large",
alignment: "left"
}],
};
}
};

View File

@@ -1,9 +1,26 @@
/**
 * Rectangle locating a text element on a slide.
 *
 * All four values are expected to be percentages (0-100) of the slide
 * dimensions rather than absolute units; nothing in this type enforces
 * that range.
 */
export interface BoundingBox {
x: number; // percentage 0-100; horizontal position measured from the left edge
y: number; // percentage of slide height; vertical position measured from the top edge
width: number; // element width as a percentage of slide width
height: number; // element height as a percentage of slide height
}
/**
 * A single piece of text extracted from a slide image, together with
 * where it sits on the slide and coarse styling hints.
 */
export interface TextElement {
// The text content of the element.
text: string;
// Semantic role of the text; consumers use it to pick styling.
elementType: 'title' | 'subtitle' | 'body' | 'bullet' | 'footer' | 'caption' | 'other';
// Position and size of the element (see BoundingBox).
boundingBox: BoundingBox;
// Coarse size category, not a point size. Optional: the PPTX export shown
// in this commit treats a missing value like 'small'.
fontSize?: 'large' | 'medium' | 'small';
// Horizontal text alignment. Optional: the PPTX export shown in this
// commit falls back to 'left' when it is absent.
alignment?: 'left' | 'center' | 'right';
}
/**
 * Structured content extracted from one slide.
 *
 * NOTE(review): this span comes from a rendered diff that interleaves
 * removed and added lines, so `title`, `bullets`, `footer` and
 * `layoutSuggestion` each appear twice below. Presumably only the optional
 * declarations after the "legacy fields" comment survive in the committed
 * file — confirm against the actual source before relying on this listing.
 */
export interface SlideContent {
title: string;
bullets: string[];
footer?: string;
// Positioned text elements detected on the slide.
elements: TextElement[];
// Speaker notes, when any were extracted.
notes?: string;
layoutSuggestion?: string; // e.g., 'Title and Content'
// Legacy fields kept for backward compatibility
title?: string;
bullets?: string[];
footer?: string;
layoutSuggestion?: string;
}
export interface ProcessedSlide {