Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download
Project: topocm
Views: 398
1
# This code is adapted from https://github.com/ihinojal/getyoutubecc
2
3
import urllib.request, urllib.error, urllib.parse
4
from xml.etree import ElementTree as ET
5
import json
6
from codecs import open
7
8
def get_cc(video_id, lang="en", track="", tlang=""):
    """Get closed captions from YouTube and parse the XML.

    The arguments are sent as the ``v``, ``lang``, ``name`` and ``tlang``
    query parameters of the timedtext endpoint.  They are percent-encoded
    here, so pass them as plain (unescaped) text.

    Parameters
    ----------
    video_id : str
        Video identifier (``v`` parameter).
    lang : str
        Caption language code (``lang`` parameter).
    track : str
        Caption track name (``name`` parameter).
    tlang : str
        Value of the ``tlang`` parameter -- presumably the language to
        translate the captions into; confirm against the API.

    Returns
    -------
    xml.etree.ElementTree.Element
        Root element of the parsed caption document.

    Raises
    ------
    RuntimeError
        If the server replies with an empty body.
    """
    # urlencode escapes spaces, '&', '=' and non-ASCII characters that would
    # otherwise corrupt the query string (or be rejected by urlopen).
    query = urllib.parse.urlencode(
        {"v": video_id, "lang": lang, "name": track, "tlang": tlang})
    cc_url = "http://youtube.com/api/timedtext?" + query

    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(cc_url) as response:
        xml_source = response.read()
    if not xml_source:
        raise RuntimeError("No CC available")

    # Force UTF-8 decoding regardless of what the XML declaration says.
    utf8_parser = ET.XMLParser(encoding='utf-8')
    return ET.fromstring(xml_source, parser=utf8_parser)
19
20
21
def xml2sjson(cc):
    """Translate a single language YouTube XML to EdX sjson.

    Parameters
    ----------
    cc : xml.etree.ElementTree.Element
        Element whose direct children each carry ``start`` and ``dur``
        attributes (parsed as floats and scaled by 1000) and the caption
        text as element text.

    Returns
    -------
    dict
        ``{'start': [...], 'end': [...], 'text': [...]}`` with one entry per
        child, in document order.  ``start`` and ``end`` are integers
        (``start * 1000`` and ``(start + dur) * 1000``, truncated); newlines
        in the text are replaced by spaces.

    Raises
    ------
    KeyError
        If a child lacks the ``start`` or ``dur`` attribute.
    """
    starts, ends, texts = [], [], []
    # Iterate the element directly: Element.getchildren() no longer exists
    # in Python 3.9+.
    for entry in cc:
        begin = float(entry.attrib['start'])
        duration = float(entry.attrib['dur'])
        starts.append(int(begin * 1e3))
        ends.append(int((begin + duration) * 1e3))
        # An element without text has text=None; treat it as an empty caption.
        texts.append((entry.text or '').replace('\n', ' '))
    return {'start': starts, 'end': ends, 'text': texts}
31
32
33
def write_sjson(sjson, filename):
    """Serialize ``sjson`` as JSON into ``filename``, encoded as UTF-8.

    An existing file at ``filename`` is overwritten.
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        json.dump(sjson, out)
36
37
38
def save_youtube_cc(video_id, filename):
    """Fetch the captions of a YouTube video and save them as an sjson file.

    Runs the whole pipeline with the default caption options of ``get_cc``:
    download and parse the caption XML, convert it with ``xml2sjson`` and
    write the result to ``filename`` with ``write_sjson``.
    """
    write_sjson(xml2sjson(get_cc(video_id)), filename)
42
43