Attachments: youtube links as transcripts

2026-05-10 21:50:14 -07:00 · 2024-08-06 00:37:31 -07:00
parent 85aed347cf
commit 67184536a6
6 changed files with 125 additions and 31 deletions
@@ -15,6 +15,7 @@ import TelegramIcon from '@mui/icons-material/Telegram';
 import TextFieldsIcon from '@mui/icons-material/TextFields';
 import TextureIcon from '@mui/icons-material/Texture';
 import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
+import YouTubeIcon from '@mui/icons-material/YouTube';

 import { RenderImageURL } from '~/modules/blocks/image/RenderImageURL';

@@ -93,6 +94,8 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com
  'url-page-html': HtmlIcon, // was LanguageIcon
  'url-page-null': TextureIcon,
  'url-page-image': ImageOutlinedIcon,
+  'youtube-transcript': YouTubeIcon,
+  'youtube-transcript-simple': YouTubeIcon,
  'ego-fragments-inlined': TelegramIcon,
  'unhandled': TextureIcon,
 };
@@ -1,4 +1,6 @@
 import { callBrowseFetchPage } from '~/modules/browse/browse.client';
+import { extractYoutubeVideoIDFromURL } from '~/modules/youtube/youtube.utils';
+import { youTubeFetchTranscript } from '~/modules/youtube/useYouTubeTranscript';

 import { agiCustomId, agiUuid } from '~/common/util/idUtils';
 import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown';
@@ -8,7 +10,7 @@ import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils';
 import { createDMessageDataInlineText, createDocAttachmentFragment, DMessageAttachmentFragment, DMessageDataInline, DMessageDocPart, DVMimeType, isContentOrAttachmentFragment, isDocPart, specialContentPartToDocAttachmentFragment } from '~/common/stores/chat/chat.fragments';
 import { liveFileCreateOrThrow } from '~/common/livefile/store-live-file';

-import type { AttachmentDraft, AttachmentDraftConverter, AttachmentDraftInput, AttachmentDraftSource, DraftEgoFragmentsInputData, DraftWebInputData } from './attachment.types';
+import type { AttachmentDraft, AttachmentDraftConverter, AttachmentDraftInput, AttachmentDraftSource, DraftEgoFragmentsInputData, DraftWebInputData, DraftYouTubeInputData } from './attachment.types';
 import type { AttachmentsDraftsStore } from './store-attachment-drafts-slice';
 import { guessInputContentTypeFromMime, heuristicMimeTypeFixup, mimeTypeIsDocX, mimeTypeIsPDF, mimeTypeIsPlainText, mimeTypeIsSupportedImage, reverseLookupMimeType } from './attachment.mimetypes';
 import { imageDataToImageAttachmentFragmentViaDBlob } from './attachment.dblobs';
@@ -23,6 +25,7 @@ const PDF_IMAGE_QUALITY = 0.5;
 // internal mimes, only used to route data within us (source -> input -> converters)
 const INT_MIME_VND_AGI_EGO_FRAGMENTS = 'application/vnd.agi.ego.fragments';
 const INT_MIME_VND_AGI_WEBPAGE = 'application/vnd.agi.webpage';
+const INT_MIME_VND_AGI_YOUTUBE = 'application/vnd.agi.youtube';


 /**
@@ -59,6 +62,29 @@ export async function attachmentLoadInputAsync(source: Readonly<AttachmentDraftS
    // Download URL (page, file, ..) and attach as input
    case 'url':
      edit({ label: source.refUrl, ref: source.refUrl });
+
+      // [YouTube] user is attaching a link to a video: try to download this as a transcript rather than a webpage
+      const asYoutubeVideoId = extractYoutubeVideoIDFromURL(source.refUrl);
+      if (asYoutubeVideoId) {
+        const transcript = await youTubeFetchTranscript(asYoutubeVideoId).catch(() => null);
+        if (transcript?.videoTitle && transcript?.transcript) {
+          edit({
+            label: transcript.videoTitle,
+            input: {
+              mimeType: INT_MIME_VND_AGI_YOUTUBE,
+              data: {
+                videoId: asYoutubeVideoId,
+                videoTitle: transcript.videoTitle,
+                videoDescription: transcript.videoDescription,
+                videoThumbnailUrl: transcript.thumbnailUrl,
+                videoTranscript: transcript.transcript,
+              },
+            },
+          });
+          break;
+        }
+      }
+
      try {
        // fetch the web page
        const { title, content: { html, markdown, text }, screenshot } = await callBrowseFetchPage(
@@ -245,6 +271,12 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
      }
      break;

+    // YouTube: custom converters
+    case input.mimeType === INT_MIME_VND_AGI_YOUTUBE:
+      converters.push({ id: 'youtube-transcript', name: 'Video Transcript' });
+      converters.push({ id: 'youtube-transcript-simple', name: 'Video Transcript (simple)' });
+      break;
+
    // EGO
    case input.mimeType === INT_MIME_VND_AGI_EGO_FRAGMENTS:
      converters.push({ id: 'ego-fragments-inlined', name: 'Message' });
@@ -280,13 +312,17 @@ function _prepareDocData(source: AttachmentDraftSource, input: Readonly<Attachme

    // Downloaded URL as Text, Markdown, or HTML
    case 'url':
-      let pageTitle = inputMime === INT_MIME_VND_AGI_WEBPAGE ? (input.data as DraftWebInputData)?.pageTitle : undefined;
+      let pageTitle =
+        inputMime === INT_MIME_VND_AGI_WEBPAGE ? (input.data as DraftWebInputData)?.pageTitle
+          : inputMime === INT_MIME_VND_AGI_YOUTUBE ? (input.data as DraftYouTubeInputData)?.videoTitle
+            : undefined;
      if (!pageTitle)
        pageTitle = `Web page: ${source.refUrl}`;
+      const urlRefString = inputMime === INT_MIME_VND_AGI_YOUTUBE ? 'youtube-' + (input.data as DraftYouTubeInputData)?.videoId : pageTitle;
      return {
        title: pageTitle,
        caption: converterName,
-        refString: humanReadableHyphenated(pageTitle),
+        refString: humanReadableHyphenated(urlRefString),
      };

    // File of various kinds and coming from various sources
@@ -639,6 +675,22 @@ export async function attachmentPerformConversion(
        break;


+      // youtube transcript
+      case 'youtube-transcript':
+      case 'youtube-transcript-simple':
+        if (!input.data || input.mimeType !== INT_MIME_VND_AGI_YOUTUBE) {
+          console.log('Expected YouTubeInputData for youtube-transcript, got:', input.data);
+          break;
+        }
+        const youtubeData = input.data as DraftYouTubeInputData;
+        const transcriptText =
+          converter.id === 'youtube-transcript-simple' ? youtubeData.videoTranscript
+            : `**YouTube Title**: ${youtubeData.videoTitle}\n\n**YouTube Description**: ${youtubeData.videoDescription}\n\n**YouTube Transcript**:\n${youtubeData.videoTranscript}\n`;
+        const transcriptTextData = createDMessageDataInlineText(transcriptText, 'text/plain');
+        newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.TextPlain, transcriptTextData, refString, docMeta, undefined));
+        break;
+
+
      // ego: message
      case 'ego-fragments-inlined':
        if (!input.data || input.mimeType !== INT_MIME_VND_AGI_EGO_FRAGMENTS || !(input.data as DraftEgoFragmentsInputData).fragments?.length) {
@@ -67,7 +67,7 @@ export type AttachmentDraftSourceOriginDTO = 'drop' | 'paste';

 export type AttachmentDraftInput = {
  mimeType: string; // Original MIME type of the file, or application specific type
-  data: string | ArrayBuffer | DraftWebInputData | DraftEgoFragmentsInputData; // The original data of the attachment
+  data: string | ArrayBuffer | DraftWebInputData | DraftYouTubeInputData | DraftEgoFragmentsInputData; // The original data of the attachment
  dataSize?: number; // Size of the original data (for plain/simple 1:1 mime)
  altMimeType?: string; // Alternative MIME type for the input
  altData?: string; // Alternative data for the input
@@ -88,6 +88,14 @@ export type DraftWebInputData = {
  pageTitle?: string;
 }

+export type DraftYouTubeInputData = { // input data of a draft whose URL was loaded as a YouTube transcript
+  videoId: string; // ID extracted from the attached URL; also used to build the 'youtube-<id>' ref string
+  videoTitle: string; // Title of the video; used as the attachment label and document title
+  videoDescription: string; // Description of the video; included by the 'youtube-transcript' converter
+  videoThumbnailUrl: string; // Thumbnail URL, as returned with the transcript
+  videoTranscript: string; // Transcript text; the only content emitted by 'youtube-transcript-simple'
+}
+
 export type DraftEgoFragmentsInputData = {
  fragments: DMessageFragment[];
  conversationTitle: string;
@@ -121,6 +129,7 @@ export type AttachmentDraftConverterType =
  | 'pdf-text' | 'pdf-images'
  | 'docx-to-html'
  | 'url-page-text' | 'url-page-markdown' | 'url-page-html' | 'url-page-null' | 'url-page-image'
+  | 'youtube-transcript' | 'youtube-transcript-simple'
  | 'ego-fragments-inlined'
  | 'unhandled';

@@ -5,9 +5,7 @@
 import * as React from 'react';
 import { useQuery } from '@tanstack/react-query';

-import { frontendSideFetch } from '~/common/util/clientFetchers';
-
-import { fetchYouTubeTranscript } from './youtube.fetcher';
+// import { fetchYouTubeTranscript } from './youtube.fetcher';
 import { apiAsync } from '~/common/util/trpc.client';

 // configuration
@@ -20,6 +18,15 @@ export interface YTVideoTranscript {
  thumbnailUrl: string;
 }

+/**
+ * Retrieves the transcript data of a YouTube video.
+ *
+ * With USE_FRONTEND_FETCH off, the request is delegated to the tRPC `youtube.getTranscript` query.
+ * With it on, the returned promise rejects: the in-browser download path is commented out.
+ *
+ * @param videoId - id of the video, forwarded to the query as `{ videoId }`
+ * @returns a promise for the result of the `youtube.getTranscript` query
+ * @throws Error (surfaced as a rejection) when USE_FRONTEND_FETCH is set
+ */
+export async function youTubeFetchTranscript(videoId: string) {
+  // default path: go through the API client
+  if (!USE_FRONTEND_FETCH)
+    return apiAsync.youtube.getTranscript.query({ videoId });
+
+  // browser path: former implementation kept for reference, currently disabled
+  // return fetchYouTubeTranscript(videoId, url => frontendSideFetch(url).then(res => res.text()));
+  throw new Error('Big-AGI: Browser youtube transcript download is disabled.');
+}
+
+
 export function useYouTubeTranscript(videoID: string | null, onNewTranscript: (transcript: YTVideoTranscript) => void) {

  // state
@@ -29,9 +36,7 @@ export function useYouTubeTranscript(videoID: string | null, onNewTranscript: (t
  const { data, isFetching, isError, error } = useQuery({
    enabled: !!videoID,
    queryKey: ['transcript', videoID],
-    queryFn: async () => USE_FRONTEND_FETCH
-      ? fetchYouTubeTranscript(videoID!, url => frontendSideFetch(url).then(res => res.text()))
-      : apiAsync.youtube.getTranscript.query({ videoId: videoID! }),
+    queryFn: async () => youTubeFetchTranscript(videoID!),
    staleTime: Infinity,
  });

@@ -54,6 +59,7 @@ export function useYouTubeTranscript(videoID: string | null, onNewTranscript: (t
  return {
    transcript,
    isFetching,
-    isError, error,
+    isError,
+    error,
  };
 }
@@ -7,7 +7,7 @@ import { z } from 'zod';
 import { createTRPCRouter, publicProcedure } from '~/server/api/trpc.server';
 import { fetchTextOrTRPCThrow } from '~/server/api/trpc.router.fetchers';

-import { fetchYouTubeTranscript } from './youtube.fetcher';
+import { downloadYouTubeTranscript } from './youtube.server';


 const inputSchema = z.object({
@@ -24,7 +24,7 @@ export const youtubeRouter = createTRPCRouter({
    .input(inputSchema)
    .query(async ({ input }) => {
      const { videoId } = input;
-      return await fetchYouTubeTranscript(videoId, (url) => fetchTextOrTRPCThrow({ url, name: 'YouTube Transcript' }));
+      return await downloadYouTubeTranscript(videoId, (url) => fetchTextOrTRPCThrow({ url, name: 'YouTube Transcript' }));
    }),

 });
@@ -1,21 +1,7 @@
 import { z } from 'zod';

-const youtubeTranscriptionSchema = z.object({
-  wireMagic: z.literal('pb3'),
-  events: z.array(
-    z.object({
-      tStartMs: z.number(),
-      dDurationMs: z.number().optional(),
-      aAppend: z.number().optional(),
-      segs: z.array(
-        z.object({
-          utf8: z.string(),
-          tOffsetMs: z.number().optional(),
-        }),
-      ).optional(),
-    }),
-  ),
-});
+/// THIS IS NORMALLY SERVER-SIDE CODE - do not include/invoke in the frontend ///
+

 function extractFromTo(html: string, from: string, to: string, label: string): string {
  const indexStart = html.indexOf(from);
@@ -29,25 +15,44 @@ function extractFromTo(html: string, from: string, to: string, label: string): s
 interface YouTubeTranscriptData { // shape of the object returned by the transcript download function
   videoId: string; // the id the transcript was requested for, returned as-is
   videoTitle: string; // taken from the page <title>, without the ' - YouTube' suffix
+  videoDescription: string; // text following "shortDescription" in the page HTML, as it appears there (not unescaped)
   thumbnailUrl: string; // i.ytimg.com URL found in the page HTML, with 'maxres' replaced by 'hq'
   transcript: string; // transcript text; presumably assembled from the parsed captions - TODO confirm (not visible here)
 }

+/**
+ * Decodes the HTML character references found in `text`.
+ *
+ * Supported: the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, and every numeric
+ * reference, decimal (`&#39;`) or hexadecimal (`&#x2F;`, `&#x2f;` - either letter case).
+ * References that are not recognized, or that do not map to a valid Unicode scalar value,
+ * are left untouched. Decoding is single-pass, so `&amp;lt;` yields `&lt;` (no double decoding).
+ *
+ * @param text - string that may contain HTML character references
+ * @returns the text with the recognized references replaced by their characters
+ */
+function decodeHtmlEntities(text: string): string {
+  // a Map (rather than an object literal) cannot be hit through Object.prototype keys
+  const namedEntities = new Map<string, string>([
+    ['amp', '&'],
+    ['lt', '<'],
+    ['gt', '>'],
+    ['quot', '"'],
+    ['apos', '\''],
+  ]);
+  return text.replace(/&(?:#x([0-9a-f]+)|#([0-9]+)|([a-z]+));/gi, (match: string, hex?: string, dec?: string, name?: string) => {
+    // named reference: exact-case lookup, unknown names are kept verbatim
+    if (name !== undefined)
+      return namedEntities.get(name) ?? match;
+
+    // numeric reference: generic decoding instead of a fixed table of known codes
+    const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? '', 10);
+    const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
+    const isScalarValue = codePoint > 0 && codePoint <= 0x10FFFF && !isSurrogate;
+
+    // String.fromCodePoint throws a RangeError above 0x10FFFF: invalid references are kept verbatim
+    return isScalarValue ? String.fromCodePoint(codePoint) : match;
+  });
+}

-export async function fetchYouTubeTranscript(videoId: string, fetchTextFn: (url: string) => Promise<string>): Promise<YouTubeTranscriptData> {
+export async function downloadYouTubeTranscript(videoId: string, fetchTextFn: (url: string) => Promise<string>): Promise<YouTubeTranscriptData> {

  // 1. find the captions URL within the video HTML page
  const html = await fetchTextFn(`https://www.youtube.com/watch?v=${videoId}`);

  const captionsUrlEnc = extractFromTo(html, 'https://www.youtube.com/api/timedtext', '"', 'Captions URL');
  const captionsUrl = decodeURIComponent(captionsUrlEnc.replaceAll('\\u0026', '&'));
+
  const thumbnailUrl = extractFromTo(html, 'https://i.ytimg.com/vi/', '"', 'Thumbnail URL').replaceAll('maxres', 'hq');
-  const videoTitle = extractFromTo(html, '<title>', '</title>', 'Video Title').slice(7).replaceAll(' - YouTube', '').trim();
+  const videoTitle = decodeHtmlEntities(extractFromTo(html, '<title>', '</title>', 'Video Title').slice(7).replaceAll(' - YouTube', '').trim());
+  const videoDescription = extractFromTo(html, ',"shortDescription":"', '","', 'Video Description').slice(21);

  // 2. fetch the captions
  // note: the desktop player appends this much: &fmt=json3&xorb=2&xobt=3&xovt=3&cbr=Chrome&cbrver=114.0.0.0&c=WEB&cver=2.20230628.07.00&cplayer=UNIPLAYER&cos=Windows&cosver=10.0&cplatform=DESKTOP
  const captions = await fetchTextFn(captionsUrl + `&fmt=json3`);

+  // parse json
  let captionsJson: any;
  try {
    captionsJson = JSON.parse(captions);
@@ -55,6 +60,24 @@ export async function fetchYouTubeTranscript(videoId: string, fetchTextFn: (url:
    console.error(e);
    throw new Error('[YouTube API Issue] Could not parse the captions');
  }
+
+  // validate object
+  const youtubeTranscriptionSchema = z.object({
+    wireMagic: z.literal('pb3'),
+    events: z.array(
+      z.object({
+        tStartMs: z.number(),
+        dDurationMs: z.number().optional(),
+        aAppend: z.number().optional(),
+        segs: z.array(
+          z.object({
+            utf8: z.string(),
+            tOffsetMs: z.number().optional(),
+          }),
+        ).optional(),
+      }),
+    ),
+  });
  const safeData = youtubeTranscriptionSchema.safeParse(captionsJson);
  if (!safeData.success) {
    console.error(safeData.error);
@@ -70,6 +93,7 @@ export async function fetchYouTubeTranscript(videoId: string, fetchTextFn: (url:
  return {
    videoId,
    videoTitle,
+    videoDescription,
    thumbnailUrl,
    transcript,
  };