Skip to content

Commit 23ff5c4

Browse files
committed
parsing json and more
1 parent 15c6c5c commit 23ff5c4

File tree

4 files changed

+62
-0
lines changed

4 files changed

+62
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,7 @@ dataset*
 node_modules
 *.mp4
 *.vtt
+transcripts/
+transcripts-txt/
 videos
 .DS_Store

prepare-piper-json.js

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import { execFileSync, execSync } from 'child_process';
import fs, { readFileSync, writeFileSync } from 'fs';
import pkg from 'webvtt-parser';

const { WebVTTParser } = pkg;
6+
7+
// Base names (without extension) of the source videos located under `videos/`.
const videos = ['file1', 'file2'];

// Directory that receives the extracted audio, the sliced clips and metadata.csv.
const datasetDir = 'dataset3';

// Running clip index shared by all videos; processVideo() advances it once per chunk.
let counter = 0;
// Accumulated metadata.csv contents; processVideo() appends one line per chunk.
let output = '';

// ffmpeg does not create missing output directories and writeFileSync fails with
// ENOENT on a missing folder, so make sure the clip directory exists up front.
fs.mkdirSync(`${datasetDir}/wavs`, { recursive: true });

for (const video of videos) {
  processVideo(`videos/${video}.mp4`, datasetDir);
}

writeFileSync(`${datasetDir}/metadata.csv`, output);
17+
18+
/**
 * Converts one video into per-chunk WAV clips and records a metadata line for each.
 *
 * The transcript is read from `transcripts/<id>.json`, where `<id>` is the first
 * `[...]`-bracketed segment of `videoPath`. Every entry of the transcript's
 * `chunks` array supplies `timestamp` (`[start, end]`) and `text`.
 *
 * Side effects: writes `<outputPath>/extracted_audio-<n>.wav` plus one
 * `<outputPath>/wavs/<n>.wav` per chunk, appends `wavs/<n>.wav|<text>` lines to
 * the module-level `output`, and increments the module-level `counter`.
 *
 * @param {string} videoPath - Path of the source video; must contain an `[id]` segment.
 * @param {string} outputPath - Dataset directory that receives the audio files.
 * @throws {Error} When `videoPath` contains no bracketed id.
 */
function processVideo(videoPath, outputPath) {
  const match = /\[(.*?)\]/.exec(videoPath);
  if (match === null) {
    // Fail with an actionable message instead of a TypeError from indexing null.
    throw new Error(
      `No [id] segment found in video path "${videoPath}"; cannot locate its transcript`,
    );
  }
  const id = match[1];
  const transcript = JSON.parse(fs.readFileSync(`transcripts/${id}.json`, 'utf-8'));

  // The full-length audio file is named after the counter value on entry.
  const audioPath = `${outputPath}/extracted_audio-${counter}.wav`;
  videoToWav(videoPath, audioPath);

  for (const chunk of transcript.chunks) {
    const [start, end] = chunk.timestamp;
    sliceAudio(audioPath, start, end, `${outputPath}/wavs/${counter}.wav`);
    output += `wavs/${counter}.wav|${chunk.text}\n`;
    counter++;
  }
}
34+
35+
/**
 * Extracts the audio of a video into a mono, 22050 Hz, signed 16-bit PCM WAV file.
 *
 * ffmpeg is run synchronously; `execFileSync` throws if it exits with a
 * non-zero status or cannot be started.
 *
 * @param {string} videoPath - Input video file.
 * @param {string} audioPath - Destination WAV file.
 */
function videoToWav(videoPath, audioPath) {
  // Arguments go to ffmpeg as an array (no shell), so paths containing quotes,
  // `$`, backticks or spaces are passed through verbatim.
  execFileSync('ffmpeg', [
    '-i', videoPath,
    '-acodec', 'pcm_s16le',
    '-ar', '22050',
    '-ac', '1',
    audioPath,
  ]);
}
39+
40+
/**
 * Copies the `[start, end]` span of an audio file into a new file without re-encoding.
 *
 * ffmpeg is run synchronously; `execFileSync` throws if it exits with a
 * non-zero status or cannot be started.
 *
 * @param {string} audioPath - Source audio file.
 * @param {number|string} start - Value passed to ffmpeg's `-ss`.
 * @param {number|string|null|undefined} end - Value passed to ffmpeg's `-to`;
 *   when null/undefined the slice runs to the end of the source.
 * @param {string} segmentPath - Destination file for the slice.
 */
function sliceAudio(audioPath, start, end, segmentPath) {
  // Arguments go to ffmpeg as an array (no shell), so paths with shell
  // metacharacters are passed through verbatim.
  const args = ['-i', audioPath, '-ss', String(start)];

  // NOTE(review): transcripts may leave the last chunk's end timestamp unset —
  // confirm against the transcript producer. Interpolating it would yield the
  // invalid `-to null`, so the option is dropped instead.
  if (end != null) {
    args.push('-to', String(end));
  }

  args.push('-c', 'copy', segmentPath);
  execFileSync('ffmpeg', args);
}
File renamed without changes.

processing-transcripts.js

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import fs from 'fs';
2+
import path from 'path';
3+
4+
// Converts every `<name>.json` file in `transcripts/` into
// `transcripts-txt/<name>.txt` containing the JSON's top-level `text` field.
const dir = 'transcripts';
const outDir = 'transcripts-txt';

const files = fs.readdirSync(dir);
console.log(files);

// writeFileSync does not create parent folders; without this the first write
// fails with ENOENT when the output directory has not been created yet.
fs.mkdirSync(outDir, { recursive: true });

for (const file of files) {
  // Only JSON transcripts are converted; everything else is left untouched.
  if (path.extname(file) !== '.json') {
    continue;
  }

  const raw = fs.readFileSync(path.join(dir, file), 'utf-8');
  const { text: transcript } = JSON.parse(raw);
  const output = path.join(outDir, `${path.basename(file, '.json')}.txt`);
  fs.writeFileSync(output, transcript);
  console.log(`Processed and saved: ${output}`);
}

0 commit comments

Comments
 (0)