Skip to content

Commit eb21f01

Browse files
Add GitHub image metadata script (#2943)
* Add GitHub image metadata script to tasks folder Co-Authored-By: [email protected] <[email protected]> * Fix script path and add generated CSV files Co-Authored-By: [email protected] <[email protected]> * Remove CSV files from PR Co-Authored-By: [email protected] <[email protected]> * Update github_image_metadata.py --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: [email protected] <[email protected]> Co-authored-by: tore-statsig <[email protected]>
1 parent 2518d93 commit eb21f01

File tree

1 file changed

+205
-0
lines changed

1 file changed

+205
-0
lines changed

tasks/github_image_metadata.py

+205
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
# To run this script:
2+
# python3 -m venv venv
3+
# source venv/bin/activate
4+
# pip install requests pyyaml
5+
6+
import os
7+
import re
8+
import csv
9+
import yaml
10+
import requests
11+
from datetime import datetime
12+
import concurrent.futures
13+
from urllib.parse import urlparse
14+
15+
# Markdown image syntax: ![alt](http://...) — URL captured in group 1.
MARKDOWN_IMAGE_PATTERN = r'!\[.*?\]\((https?://[^)]+)\)'
# HTML <img ... src="http://..."> tags — URL captured in group 1.
HTML_IMG_PATTERN = r'<img.*?src=["\'](https?://[^"\']+)["\']'
# Custom <GitHubEmbed url="http://..."> MDX component — URL captured in group 1.
GITHUB_EMBED_PATTERN = r'<GitHubEmbed.*?url=["\'](https?://[^"\']+)["\']'
# 'owner: <name>' entry inside a file's frontmatter text.
OWNER_PATTERN = r'owner:\s*([^\s]+)'
19+
20+
def find_doc_files(root='.'):
    """
    Recursively find documentation files (.md or .mdx) under *root*.

    Args:
        root: Directory to start the walk from (defaults to the CWD).

    Returns:
        A list of full paths to every matching file.
    """
    doc_files = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            # str.endswith accepts a tuple, so both extensions are
            # tested in a single call instead of an `or` chain.
            if filename.endswith(('.md', '.mdx')):
                doc_files.append(os.path.join(dirpath, filename))
    return doc_files
31+
32+
def extract_frontmatter(file_path):
    """
    Read a markdown file and return its frontmatter as a string.

    The frontmatter is the text between the leading '---' marker and the
    next '---'.  An empty string is returned when the file has no
    frontmatter or cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            text = fh.read()
    except Exception as e:
        print(f"Error extracting frontmatter from {file_path}: {e}")
        return ""

    # Guard clauses: no opening marker, or no closing marker found.
    if not text.startswith('---'):
        return ""
    closing = text.find('---', 3)
    if closing == -1:
        return ""
    return text[3:closing].strip()
50+
51+
def extract_owner(frontmatter):
    """
    Pull the document owner out of a frontmatter string.

    First attempts a structured parse: load the frontmatter as YAML and
    look for an 'owner:<name>' entry in its 'keywords' list.  If that
    fails or finds nothing, fall back to a plain regex scan with
    OWNER_PATTERN.  Returns the owner string, or None when not found.
    """
    try:
        # Preferred path: structured YAML lookup.  Any parse problem is
        # swallowed so the regex fallback below still gets a chance.
        try:
            parsed = yaml.safe_load(frontmatter)
            if parsed and 'keywords' in parsed:
                for entry in parsed['keywords']:
                    if isinstance(entry, str) and entry.startswith('owner:'):
                        return entry.split(':', 1)[1].strip()
        except Exception:
            pass

        # Fallback: raw text scan of the frontmatter.
        found = re.search(OWNER_PATTERN, frontmatter)
        return found.group(1).strip() if found else None
    except Exception as e:
        print(f"Error extracting owner: {e}")
        return None
74+
75+
def extract_image_urls_and_owner(file_path):
    """
    Collect every image URL referenced by a markdown or MDX file.

    Scans the file body with the markdown-image, HTML <img>, and
    <GitHubEmbed> patterns, in that order.  Returns a list of
    (file_path, image_url, owner) tuples, where owner comes from the
    file's frontmatter and may be None.
    """
    found = []
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            body = fh.read()

        owner = extract_owner(extract_frontmatter(file_path))

        # Each pattern captures the image URL in group 1, so one loop
        # covers all three reference styles.
        for pattern in (MARKDOWN_IMAGE_PATTERN, HTML_IMG_PATTERN, GITHUB_EMBED_PATTERN):
            for hit in re.finditer(pattern, body):
                found.append((file_path, hit.group(1), owner))
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

    return found
103+
104+
def is_github_cdn_link(url):
    """
    Return True when *url* points at a GitHub CDN asset.

    Matches github.com and githubusercontent.com (including any of their
    subdomains) plus GitHub 'user-attachments' upload URLs.
    """
    # Hostnames are case-insensitive, so normalise before comparing.
    host = urlparse(url).netloc.lower()
    # Require an exact host match or a dot-separated subdomain: a bare
    # endswith('github.com') would also accept e.g. 'evilgithub.com'.
    for domain in ('githubusercontent.com', 'github.com'):
        if host == domain or host.endswith('.' + domain):
            return True
    return 'user-attachments' in url
112+
113+
def get_last_modified(url):
    """
    Fetch the Last-Modified header for *url*, following redirects.

    Only GitHub CDN links are actually queried.  Returns a normalised
    'YYYY-MM-DD HH:MM:SS' timestamp when the header parses, the raw
    header when it does not, or a short status string describing why no
    date is available ('Not a GitHub CDN link', HTTP/transport errors,
    missing header).
    """
    try:
        if not is_github_cdn_link(url):
            return "Not a GitHub CDN link"

        browser_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

        # stream=True defers the body download; only headers are needed,
        # and close() releases the connection immediately.
        resp = requests.get(url, headers=browser_headers, timeout=10, allow_redirects=True, stream=True)
        resp.close()

        if resp.status_code != 200:
            return f"HTTP Error: {resp.status_code}"

        stamp = resp.headers.get('Last-Modified')
        if not stamp:
            return "No Last-Modified header"

        try:
            parsed = datetime.strptime(stamp, "%a, %d %b %Y %H:%M:%S %Z")
        except ValueError:
            # Unparseable format: hand back the raw header value.
            return stamp
        return parsed.strftime("%Y-%m-%d %H:%M:%S")
    except Exception as e:
        return f"Error: {str(e)}"
143+
144+
def process_file(file_path):
    """
    Gather image metadata for one documentation file.

    Looks up every image reference in the file and fetches each URL's
    Last-Modified date.  Returns a list of
    (image_url, file_path, last_modified, owner) tuples.
    """
    # Distinct loop names avoid shadowing the file_path parameter; each
    # extracted row already carries its own path.
    return [
        (url, path, get_last_modified(url), owner)
        for path, url, owner in extract_image_urls_and_owner(file_path)
    ]
157+
158+
def _write_csv(path, rows):
    """Write image-metadata *rows* to *path* with the standard header."""
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Image URL', 'Docs Page', 'Last Modified', 'Owner'])
        writer.writerows(rows)


def main():
    """
    Entry point: scan the docs tree, collect image metadata in parallel,
    and write two CSV reports (GitHub-CDN-only rows and all rows).
    """
    print('Searching for documentation files with image references...')
    # The script may be run from the repo root or from the tasks/ folder.
    if os.path.exists('../docs'):
        docs_path = '../docs'
    elif os.path.exists('./docs'):
        docs_path = './docs'
    else:
        print("Error: Could not find docs directory. Make sure you're running this script from the repository root or tasks directory.")
        return

    print(f"Using docs path: {docs_path}")
    doc_files = find_doc_files(docs_path)
    results = []

    # Fetching Last-Modified headers is network-bound, so fan out across
    # threads; the GIL is released while each request waits on I/O.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_file = {executor.submit(process_file, file_path): file_path for file_path in doc_files}

        for future in concurrent.futures.as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                file_results = future.result()
                results.extend(file_results)
                print(f"Processed {file_path}, found {len(file_results)} images")
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    # Keep only GitHub CDN rows whose Last-Modified lookup succeeded
    # (get_last_modified signals failure via "HTTP Error"/"Error" strings).
    github_cdn_results = [
        (url, path, last_modified, owner)
        for url, path, last_modified, owner in results
        if is_github_cdn_link(url)
        and not last_modified.startswith("HTTP Error")
        and not last_modified.startswith("Error")
    ]

    csv_path = 'github_image_metadata_with_owner.csv'
    _write_csv(csv_path, github_cdn_results)

    all_csv_path = 'all_image_metadata_with_owner.csv'
    _write_csv(all_csv_path, results)

    print(f'Found {len(results)} total images in {len(doc_files)} documentation files')
    print(f'Filtered to {len(github_cdn_results)} GitHub CDN images with valid Last-Modified dates')
    print(f'GitHub CDN images with owner saved to {csv_path}')
    print(f'All images with owner saved to {all_csv_path}')
203+
204+
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)