http.py 15.6 KB
Newer Older
Philipp Hagemeister's avatar
Philipp Hagemeister committed
1
2
from __future__ import unicode_literals

3
import errno
4
import os
5
import socket
6
import time
7
import random
remitamine's avatar
remitamine committed
8
import re
9
10

from .common import FileDownloader
Sergey M․'s avatar
Sergey M․ committed
11
12
13
14
from ..compat import (
    compat_str,
    compat_urllib_error,
)
15
from ..utils import (
16
17
    ContentTooShortError,
    encodeFilename,
Sergey M․'s avatar
Sergey M․ committed
18
    int_or_none,
19
    sanitize_open,
20
    sanitized_Request,
21
22
23
    write_xattr,
    XAttrMetadataError,
    XAttrUnavailableError,
24
25
26
27
)


class HttpFD(FileDownloader):
    """File downloader that fetches a URL through ``self.ydl.urlopen``.

    Supports resuming from an existing temporary file, optional chunked
    (ranged) requests and retrying after transient errors.
    """

    def real_download(self, filename, info_dict):
        """Download ``info_dict['url']`` into ``filename``.

        Data is written to the temporary file named by
        ``self.temp_name(filename)`` and renamed to ``filename`` once the
        transfer is complete.

        Options read from ``self.params``: ``test``, ``http_chunk_size``,
        ``buffersize``, ``continuedl``, ``retries``, ``min_filesize``,
        ``max_filesize``, ``xattr_set_filesize``, ``noresizebuffer`` and
        ``updatetime``. ``info_dict`` may additionally carry ``http_headers``
        and ``downloader_options['http_chunk_size']``.

        Returns True when the file has been downloaded (or was found to be
        already complete) and False when the download was aborted or given up
        after ``retries`` retries.

        Raises the underlying HTTP/socket error when it is not considered
        retryable, and ContentTooShortError when fewer bytes than announced
        were received and no retries are left.
        """
        url = info_dict['url']

        class DownloadContext(dict):
            # Dict with attribute access; missing attributes read as None
            __getattr__ = dict.get
            __setattr__ = dict.__setitem__
            __delattr__ = dict.__delitem__

        ctx = DownloadContext()
        ctx.filename = filename
        ctx.tmpfilename = self.temp_name(filename)
        ctx.stream = None

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        add_headers = info_dict.get('http_headers')
        if add_headers:
            headers.update(add_headers)

        is_test = self.params.get('test', False)
        chunk_size = self._TEST_FILE_SIZE if is_test else (
            info_dict.get('downloader_options', {}).get('http_chunk_size')
            or self.params.get('http_chunk_size') or 0)

        ctx.open_mode = 'wb'
        ctx.resume_len = 0
        ctx.data_len = None
        ctx.block_size = self.params.get('buffersize', 1024)
        ctx.start_time = time.time()
        ctx.chunk_size = None

        if self.params.get('continuedl', True):
            # Establish possible resume length
            if os.path.isfile(encodeFilename(ctx.tmpfilename)):
                ctx.resume_len = os.path.getsize(
                    encodeFilename(ctx.tmpfilename))

        ctx.is_resume = ctx.resume_len > 0

        count = 0
        retries = self.params.get('retries', 0)

        class SucceedDownload(Exception):
            """Raised when the file turns out to be already fully downloaded."""
            pass

        class RetryDownload(Exception):
            """Raised to request another attempt; carries the original error."""
            def __init__(self, source_error):
                self.source_error = source_error

        class NextFragment(Exception):
            """Raised after a chunk is finished to request the next one."""
            pass

        def set_range(req, start, end):
            """Add a ``Range: bytes=start-[end]`` header to ``req``."""
            range_header = 'bytes=%d-' % start
            if end:
                range_header += compat_str(end)
            req.add_header('Range', range_header)

        def establish_connection():
            """Open the HTTP response for the next piece and store it in ctx.data.

            Sets ctx.data_len, ctx.open_mode and ctx.resume_len according to
            whether the server honoured the requested range. Raises
            RetryDownload for retryable errors and SucceedDownload when the
            local file is already complete.
            """
            ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size)
                              if not is_test and chunk_size else chunk_size)
            if ctx.resume_len > 0:
                range_start = ctx.resume_len
                if ctx.is_resume:
                    self.report_resuming_byte(ctx.resume_len)
                ctx.open_mode = 'ab'
            elif ctx.chunk_size > 0:
                range_start = 0
            else:
                range_start = None
            ctx.is_resume = False
            range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None
            if range_end and ctx.data_len is not None and range_end >= ctx.data_len:
                range_end = ctx.data_len - 1
            has_range = range_start is not None
            ctx.has_range = has_range
            request = sanitized_Request(url, None, headers)
            if has_range:
                set_range(request, range_start, range_end)
            # Establish connection
            try:
                try:
                    ctx.data = self.ydl.urlopen(request)
                except (compat_urllib_error.URLError, ) as err:
                    if isinstance(err.reason, socket.timeout):
                        raise RetryDownload(err)
                    # Bare raise keeps the original traceback
                    raise
                # When trying to resume, Content-Range HTTP header of response has to be checked
                # to match the value of requested Range HTTP header. This is due to webservers
                # that don't support resuming and serve a whole file with no Content-Range
                # set in response despite the requested Range (see
                # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799)
                if has_range:
                    content_range = ctx.data.headers.get('Content-Range')
                    if content_range:
                        content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range)
                        # Content-Range is present and matches requested Range, resume is possible
                        if content_range_m:
                            if range_start == int(content_range_m.group(1)):
                                content_range_end = int_or_none(content_range_m.group(2))
                                content_len = int_or_none(content_range_m.group(3))
                                accept_content_len = (
                                    # Non-chunked download
                                    not ctx.chunk_size
                                    # Chunked download and requested piece or
                                    # its part is promised to be served
                                    or content_range_end == range_end
                                    # The total length may be absent (e.g.
                                    # "bytes 0-99/*"); comparing None with an
                                    # int raises TypeError on Python 3, so an
                                    # unknown total is accepted explicitly
                                    # (which is how Python 2 evaluated it)
                                    or content_len is None
                                    or content_len < range_end)
                                if accept_content_len:
                                    ctx.data_len = content_len
                                    return
                    # Content-Range is either not present or invalid. Assuming remote webserver is
                    # trying to send the whole file, resume is not possible, so wiping the local file
                    # and performing entire redownload
                    self.report_unable_to_resume()
                    ctx.resume_len = 0
                    ctx.open_mode = 'wb'
                ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None))
                return
            except (compat_urllib_error.HTTPError, ) as err:
                if err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        ctx.data = self.ydl.urlopen(
                            sanitized_Request(url, None, headers))
                        # .get() yields None for a missing header on every
                        # Python version instead of raising KeyError
                        content_length = ctx.data.info().get('Content-Length')
                    except (compat_urllib_error.HTTPError, ) as reopen_err:
                        # A distinct name is used on purpose: Python 3 unbinds
                        # the handler's target name when the handler exits, so
                        # reusing ``err`` would break any later use of it
                        if reopen_err.code < 500 or reopen_err.code >= 600:
                            raise
                        # Server-side (5xx) failure of the re-request: retry
                        raise RetryDownload(reopen_err)
                    else:
                        # Examine the reported length
                        if (content_length is not None
                                and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(ctx.filename)
                            self.try_rename(ctx.tmpfilename, ctx.filename)
                            self._hook_progress({
                                'filename': ctx.filename,
                                'status': 'finished',
                                'downloaded_bytes': ctx.resume_len,
                                'total_bytes': ctx.resume_len,
                            })
                            raise SucceedDownload()
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            ctx.resume_len = 0
                            ctx.open_mode = 'wb'
                            return
                elif err.code < 500 or err.code >= 600:
                    # Unexpected HTTP error
                    raise
                # Remaining case: a 5xx response, which is worth retrying
                raise RetryDownload(err)
            except socket.error as err:
                if err.errno != errno.ECONNRESET:
                    # Connection reset is no problem, just retry
                    raise
                raise RetryDownload(err)

        def download():
            """Read ctx.data block by block and write it to the temporary file.

            Returns True when the file is complete and False when the download
            was aborted. Raises NextFragment when only a chunk of the file has
            been fetched so far and RetryDownload when the transfer should be
            attempted again.
            """
            data_len = ctx.data.info().get('Content-length', None)

            # Range HTTP header may be ignored/unsupported by a webserver
            # (e.g. extractor/scivee.py, extractor/bambuser.py).
            # However, for a test we still would like to download just a piece of a file.
            # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
            # block size when downloading a file.
            if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
                data_len = self._TEST_FILE_SIZE

            if data_len is not None:
                data_len = int(data_len) + ctx.resume_len
                min_data_len = self.params.get('min_filesize')
                max_data_len = self.params.get('max_filesize')
                if min_data_len is not None and data_len < min_data_len:
                    self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
                    return False
                if max_data_len is not None and data_len > max_data_len:
                    self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
                    return False

            byte_counter = 0 + ctx.resume_len
            block_size = ctx.block_size
            start = time.time()

            # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
            now = None  # needed for slow_down() in the first loop run
            before = start  # start measuring

            def retry(e):
                """Close the output stream, record the resume offset and raise RetryDownload."""
                to_stdout = ctx.tmpfilename == '-'
                if ctx.stream is not None:
                    if not to_stdout:
                        ctx.stream.close()
                    ctx.stream = None
                if to_stdout:
                    ctx.resume_len = byte_counter
                else:
                    tmp_path = encodeFilename(ctx.tmpfilename)
                    # The temporary file is only created once the first block
                    # has been read, so it may not exist yet when the very
                    # first read fails; resume from the beginning in that case
                    # instead of failing with OSError
                    ctx.resume_len = os.path.getsize(tmp_path) if os.path.isfile(tmp_path) else 0
                raise RetryDownload(e)

            while True:
                try:
                    # Download and write
                    data_block = ctx.data.read(block_size if data_len is None else min(block_size, data_len - byte_counter))
                # socket.timeout is a subclass of socket.error but may not have
                # errno set
                except socket.timeout as e:
                    retry(e)
                except socket.error as e:
                    # SSLError on python 2 (inherits socket.error) may have
                    # no errno set but this error message.
                    # Exceptions have no ``message`` attribute on Python 3,
                    # hence the explicit default for getattr()
                    if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out':
                        retry(e)
                    raise

                byte_counter += len(data_block)

                # exit loop when download is finished
                if len(data_block) == 0:
                    break

                # Open destination file just in time
                if ctx.stream is None:
                    try:
                        ctx.stream, ctx.tmpfilename = sanitize_open(
                            ctx.tmpfilename, ctx.open_mode)
                        assert ctx.stream is not None
                        ctx.filename = self.undo_temp_name(ctx.tmpfilename)
                        self.report_destination(ctx.filename)
                    except (OSError, IOError) as err:
                        self.report_error('unable to open for writing: %s' % str(err))
                        return False

                    if self.params.get('xattr_set_filesize', False) and data_len is not None:
                        try:
                            write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
                        except (XAttrUnavailableError, XAttrMetadataError) as err:
                            self.report_error('unable to set filesize xattr: %s' % str(err))

                try:
                    ctx.stream.write(data_block)
                except (IOError, OSError) as err:
                    self.to_stderr('\n')
                    self.report_error('unable to write data: %s' % str(err))
                    return False

                # Apply rate limit
                self.slow_down(start, now, byte_counter - ctx.resume_len)

                # end measuring of one loop run
                now = time.time()
                after = now

                # Adjust block size
                if not self.params.get('noresizebuffer', False):
                    block_size = self.best_block_size(after - before, len(data_block))

                before = after

                # Progress message
                speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
                if ctx.data_len is None:
                    eta = None
                else:
                    eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len)

                self._hook_progress({
                    'status': 'downloading',
                    'downloaded_bytes': byte_counter,
                    'total_bytes': ctx.data_len,
                    'tmpfilename': ctx.tmpfilename,
                    'filename': ctx.filename,
                    'eta': eta,
                    'speed': speed,
                    'elapsed': now - ctx.start_time,
                })

                if data_len is not None and byte_counter == data_len:
                    break

            # Chunked download with more of the file left: fetch the next chunk
            if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len:
                ctx.resume_len = byte_counter
                # ctx.block_size = block_size
                raise NextFragment()

            if ctx.stream is None:
                self.to_stderr('\n')
                self.report_error('Did not get any data blocks')
                return False
            if ctx.tmpfilename != '-':
                ctx.stream.close()

            if data_len is not None and byte_counter != data_len:
                err = ContentTooShortError(byte_counter, int(data_len))
                if count <= retries:
                    retry(err)
                raise err

            self.try_rename(ctx.tmpfilename, ctx.filename)

            # Update file modification time
            if self.params.get('updatetime', True):
                info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))

            self._hook_progress({
                'downloaded_bytes': byte_counter,
                'total_bytes': byte_counter,
                'filename': ctx.filename,
                'status': 'finished',
                'elapsed': time.time() - ctx.start_time,
            })

            return True

        while count <= retries:
            try:
                establish_connection()
                return download()
            except RetryDownload as e:
                count += 1
                if count <= retries:
                    self.report_retry(e.source_error, count, retries)
                continue
            except NextFragment:
                # Moving on to the next chunk does not count as a retry
                continue
            except SucceedDownload:
                return True

        self.report_error('giving up after %s retries' % retries)
        return False