"""Download original (non-recompressed) images from oricon.co.jp pages.

Oricon serves a randomly re-compressed JPEG (q85, 2x2,1x1,1x1 sampling) for
some requests and the untouched original for others, so the strategy is:
download once, judge the quality with ImageMagick, and keep re-requesting
until a different (hopefully original) file comes back.
"""

import re
import sys
from pathlib import Path
from shutil import which
from subprocess import check_output
from urllib.parse import unquote, urljoin

import requests
from bs4 import BeautifulSoup


def get_webname(url):
    """Return the percent-decoded filename component of *url* (query stripped)."""
    return unquote(url.split('?')[0].split('/')[-1])


def get(url):
    """GET *url* and return it parsed as a BeautifulSoup document.

    The site is Shift-JIS encoded.  BUGFIX: the original code assigned to a
    non-existent ``r.encodings`` attribute, which requests silently ignores;
    the real attribute is ``encoding``.
    """
    r = requests.get(url)
    r.encoding = 'shift-jis'
    return BeautifulSoup(r.content, 'html.parser')


def get_jpeg_quality(f):
    """Return ``(quality, sampling_factor)`` for JPEG file *f* via ImageMagick.

    Requires the ``magick`` binary on PATH (checked in ``__main__``).
    """
    output = check_output(
        ['magick', 'identify', '-format', '%Q;%[jpeg:sampling-factor]', f])
    quality, sampling_factor = output.decode('utf8').split(';')
    return int(quality), sampling_factor


def bytes_to_kb(num_bytes):
    """Format a byte count as a kilobyte string, e.g. ``'12.345 KB'``.

    (Parameter renamed from ``bytes``, which shadowed the builtin.)
    """
    return f'{int(num_bytes) / 1024:.3f} KB'


def download(url_or_res, f):
    """Stream *url_or_res* (a URL string or an open streaming Response) to path *f*.

    The response is always closed afterwards so the underlying connection is
    returned to the pool (the original leaked it).
    """
    if isinstance(url_or_res, str):
        r = requests.get(url_or_res, stream=True)
    else:
        r = url_or_res
    try:
        with f.open('wb') as fio:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    fio.write(chunk)
    finally:
        r.close()


def get_orig(url, save_dir='.', test_mode=False, bad_file='delete'):
    """Repeatedly fetch *url* until a likely-original JPEG is obtained.

    Heuristic: q85 with 2x2,1x1,1x1 sampling is the server's re-compression
    signature ('bad'); q > 85 is taken as original ('good'); anything else is
    'not sure'.

    Parameters:
        save_dir:  directory to save into.
        test_mode: keep both downloaded files for manual comparison.
        bad_file:  what to do with a confirmed re-compressed file —
                   'delete', 'keep' (rename ``*_bad.jpg``), or
                   'move_to_subfolder' (move into ``tobedel/``).

    Returns True when done (including the give-up case after 100 tries).
    """

    def check_quality(f):
        # Classify a downloaded file; prints a one-line summary as a side effect.
        size = f.stat().st_size
        q, sampling_factor = get_jpeg_quality(f)
        print(f'{bytes_to_kb(size)}, q{q}, {sampling_factor}')
        if q == 85 and sampling_factor == '2x2,1x1,1x1':
            return 'bad'
        if q > 85:
            return 'good'
        return 'not sure'

    save_dir = Path(save_dir)
    print(f'Getting {url}')
    web_name = get_webname(url)
    f = save_dir / web_name
    download(url, f)
    print(' First try: ', end='')
    old_quality = check_quality(f)
    if old_quality == 'good':
        f.rename(f.with_name(f'{f.stem}_orig.jpg'))
        print(f' Find original. Stop.')
        return True
    if old_quality == 'bad':
        print(f' Likely re-compressed.')
    else:
        print(f' Not sure. \n Try to get a different one anyway.')
    filesize = f.stat().st_size  # cache filesize
    tries = 0
    while True:
        tries += 1
        r = requests.get(url, stream=True)
        # .get() instead of [] — a missing Content-length header should not crash.
        if r.headers.get('Content-length') == str(filesize):
            print(f' Remote is still the same size.')
            r.close()  # body never consumed; release the connection
        else:
            print(f' Got a different file: ', end='')
            savef = f.with_name(f'{f.stem}_orig.jpg')
            download(r, savef)
            new_quality = check_quality(savef)
            new_filesize = savef.stat().st_size
            if test_mode:
                print(f' Test mode. So keep both files.')
                f.rename(f.with_name(f'{f.stem}_{filesize}.jpg'))
                savef.rename(f.with_name(f'{f.stem}_{new_filesize}.jpg'))
                return True
            # potential results:
            # bad -> good, not sure -> good: keep new
            # (old_quality can only be 'bad' or 'not sure' here — 'good'
            #  already returned above)
            if (old_quality == 'bad' and new_quality == 'good'
                    or old_quality == 'not sure' and new_quality == 'good'):
                print(f' New file is original. Stop and cleanup.')
                if bad_file == 'delete':
                    f.unlink()
                elif bad_file == 'keep':
                    f.rename(f.with_name(f'{f.stem}_bad.jpg'))
                elif bad_file == 'move_to_subfolder':
                    (save_dir / 'tobedel').mkdir(exist_ok=True)
                    f.rename(save_dir / 'tobedel' / f.name)
                savef.rename(f.with_name(f'{f.stem}_orig.jpg'))
                return True
            # bad -> not sure, bad -> bad, not sure -> not sure,
            # not sure -> bad: keep both
            print(' Not sure which one is better. Save both. \n Please check yourself.')
            f.rename(f.with_name(f'{f.stem}_{filesize}.jpg'))
            savef.rename(f.with_name(f'{f.stem}_{new_filesize}.jpg'))
            return True
        if tries > 100:
            print(f' Failed to get a different version of {url} after 100 tries.')
            return True


def get_image_from_photo_page(soup):
    """Extract the full-size image URL from a photo-page document.

    Prefers the ``og:image`` meta tag (with any cdn-cgi resizing path segment
    stripped so the un-resized upload is fetched); falls back to the main
    photo ``<img>``.
    """
    if (ele := soup.find('meta', {'property': 'og:image'})) and ele.has_attr('content'):
        url = ele['content']
        url = re.sub(r'cdn-cgi/image/[^/]+/upimg', 'upimg', url)
        return url
    else:
        return soup.select_one('div#main_photo img')['src']


def single(url):
    """Fetch the original for one photo page (or one direct image URL)."""
    if re.search(r'oricon\.co\.jp/news/\d+/photo/\d+', url):
        img_url = get_image_from_photo_page(get(url))
    elif 'contents.oricon.co.jp' in url:
        img_url = url  # already a direct image URL
    else:
        print(f'{url}: not a valid URL for single mode.')
        return False
    get_orig(img_url)


def main(url):
    """Fetch originals for every image reachable from *url*.

    Supports news pages (walks the photo slider) and photo/special galleries
    (walks pager-next pages, then probes candidate sizes largest-first).
    """
    img_url_candidates = []
    if re.search(r'oricon\.co\.jp/news/', url):
        print(f'{url}: news type')
        # Normalize any /news/<id>/... URL to its first photo page.
        url = re.sub(r'/news/(\d+)/*.+$', r'/news/\1/photo/1/', url)
        print(f'Getting image from {url}')
        soup = get(url)
        img_url_candidates.append(get_image_from_photo_page(soup))
        for a in soup.select('div.photo_slider li > a'):
            new_url = urljoin(url, a['href'])
            if new_url == url:
                continue  # the slider links back to the current page too
            print(f'Getting image from {new_url}')
            soup2 = get(new_url)
            img_url_candidates.append(get_image_from_photo_page(soup2))
    elif m := re.search(r'oricon\.co\.jp/(photo|special)/\d+', url):
        page_type = m[1]
        # https://www.oricon.co.jp/special/785/
        url = re.sub(r'/(photo|special)/(\d+)/*.+$', r'/\1/\2/', url)
        print(f'{url}: {page_type} type')
        # suffix -> prefix; detail image URLs look like
        # <prefix>/detail/img<size><suffix>, so each unique suffix is one image.
        img_urls = {}
        while True:
            print(f'Getting image from {url}')
            soup = get(url)
            for img in soup.find_all('img'):
                img_url = None
                # Lazy-loaded images keep the real URL in data-original.
                for attr in ['data-original', 'src']:
                    if img.has_attr(attr):
                        img_url = img[attr]
                        break
                assert img_url is not None
                if (m := re.search(r'^(.+/(?:photo|special)/img/\d+/\d+/detail/img)(\d+)(/.+$)', img_url)):
                    if m[3] not in img_urls:
                        img_urls[m[3]] = m[1]
                    else:
                        # Same image must always resolve to the same prefix.
                        assert img_urls[m[3]] == m[1]
            # Hoisted: the original queried a.pager-next twice.
            next_link = soup.select_one('a.pager-next')
            if next_link:
                url = urljoin(url, next_link['href'])
            else:
                break
        for suffix, prefix in img_urls.items():
            # Probe sizes largest-first; keep the first that exists.
            for size in [1500, 660, 480, 200, 100]:
                img_url = f'{prefix}{size}{suffix}'
                if requests.head(img_url).status_code == 200:
                    print(f'Find a valid image: {img_url}')
                    img_url_candidates.append(img_url)
                    break
    else:
        print(f'{url}: not a valid URL.')
        return
    # De-duplicate while preserving discovery order.
    img_url_candidates = list(dict.fromkeys(img_url_candidates))
    for photo_url in img_url_candidates:
        get_orig(photo_url)


if __name__ == '__main__':
    # detect magick
    if not which('magick'):
        print('magick is not installed. Please install it.')
        sys.exit(1)
    if len(sys.argv) < 2:
        # BUGFIX: the usage string had lost its argument placeholder.
        print('Usage: oricon.py [single] <url>')
        sys.exit(1)
    if len(sys.argv) == 3 and sys.argv[1] == 'single':
        single(sys.argv[2])
    else:
        main(sys.argv[1])