feat: convert site to astro via codex
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
import fs from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fg from 'fast-glob';
import matter from 'gray-matter';
|
||||
|
||||
// Project root: the parent of the directory that contains this script.
// fileURLToPath is used instead of URL#pathname because pathname stays
// percent-encoded (spaces, non-ASCII) and is not a valid native path on Windows.
const root = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
// `dist` directory under the root; its HTML files are audited when it exists.
const dist = path.join(root, 'dist');
|
||||
/**
 * Normalize a URL path so that equivalent routes compare equal.
 *
 * Percent-escapes are decoded, a trailing `/index` or `/index.html` collapses
 * to `/`, a trailing `.html` is dropped, and a trailing slash is removed from
 * everything except the root path.
 *
 * @param {string} value - URL path to normalize.
 * @returns {string} The normalized path.
 */
const normalizePath = (value) => {
  let decoded;
  try {
    decoded = decodeURIComponent(value);
  } catch (error) {
    // A stray `%` (e.g. "/100%") is malformed; keep the raw text instead of
    // aborting the whole audit.
    if (!(error instanceof URIError)) throw error;
    decoded = value;
  }
  const stripped = decoded.replace(/\/index(?:\.html)?$/, '/').replace(/\.html$/, '');
  return stripped !== '/' ? stripped.replace(/\/$/, '') : stripped;
};
|
||||
/**
 * Derive the site route for a content file from its front matter.
 *
 * @param {string} file - Path of the content file; a `/blog/` segment marks a library post.
 * @param {object} data - Parsed front matter (`slug`, `lang`, `draft` are read).
 * @returns {string|null} The route, or null for drafts and entries without a slug.
 */
const contentRoute = (file, data) => {
  if (!data.slug || data.draft) return null;
  // A missing or empty `lang` is treated as English; otherwise the prefix
  // would become "/undefined" (or a bare "/").
  const lang = data.lang || 'en';
  const languagePrefix = lang === 'en' ? '' : `/${lang}`;
  if (file.includes('/blog/')) return `${languagePrefix}/library/${data.slug}`;
  if (data.slug === 'index') return languagePrefix || '/';
  // Covers the `library` listing slug as well: it maps to `<prefix>/library`.
  return `${languagePrefix}/${data.slug}`;
};
|
||||
/**
 * Resolve a link target found in content against the route of the page that
 * contains it.
 *
 * @param {string} reference - Raw link target (may be wrapped in `<...>`).
 * @param {string} route - Route of the page holding the link.
 * @returns {string|null} Normalized internal path; null for empty, fragment-only,
 *   scheme-qualified, or protocol-relative targets; the cleaned text if resolving fails.
 */
const resolveInternalReference = (reference, route) => {
  const target = reference.trim().replace(/^<|>$/g, '');
  const hasScheme = /^[a-z][a-z\d+.-]*:/i.test(target);
  if (target === '' || target.startsWith('#') || hasScheme || target.startsWith('//')) {
    return null;
  }
  try {
    // The host is a placeholder; only the resolved pathname is used.
    const pageUrl = new URL(route, 'https://audit.local');
    const { pathname } = new URL(target, pageUrl);
    return normalizePath(pathname);
  } catch {
    return target;
  }
};
|
||||
/**
 * Recursively collect every `.html` file below a directory.
 *
 * @param {string} dir - Directory to scan.
 * @returns {Promise<string[]>} File paths relative to `dist`.
 */
async function walk(dir) {
  const entries = await fs.readdir(dir, { withFileTypes: true });
  const found = [];
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      const nested = await walk(fullPath);
      found.push(...nested);
      continue;
    }
    if (entry.name.endsWith('.html')) {
      found.push(path.relative(dist, fullPath));
    }
  }
  return found;
}
|
||||
// True when the `dist` directory is accessible. Without it, the block below
// audits source content and templates instead, writes the report, and exits.
const distExists = await fs.access(dist).then(() => true).catch(() => false);

if (!distExists) {
  // Routes seeded up front: the home and library pages for each language.
  const routes = new Set(['/', '/library', '/es', '/es/library', '/ar', '/ar/library']);
  const contentFiles = await fg('src/content/**/*.{md,mdx}', { cwd: root, absolute: true });
  const contentRecords = [];
  for (const file of contentFiles) {
    const parsed = matter(await fs.readFile(file, 'utf8'));
    const route = contentRoute(file, parsed.data);
    if (!route) continue;
    const normalizedRoute = normalizePath(route);
    routes.add(normalizedRoute);
    contentRecords.push({ file, route: normalizedRoute, text: parsed.content });
  }

  // Every file under public/, expressed as the absolute URL path it is served at.
  const publicFiles = new Set(
    (await fg('public/**/*', { cwd: root, onlyFiles: true })).map((file) => `/${file.replace(/^public\//, '')}`)
  );
  const broken = [];

  // Pass 1: Markdown links inside content bodies.
  for (const { file, route, text } of contentRecords) {
    // Home/library pages under /pages/ are skipped.
    // NOTE(review): presumably their links come from templates — confirm.
    if (file.includes('/pages/') && ['/library', '/es/library', '/ar/library', '/', '/es', '/ar'].includes(route)) continue;
    const references = [...text.matchAll(/\]\(([^)\s]+)(?:\s+["'][^)]*)?\)/g)].map((match) => match[1]);
    for (const rawReference of references) {
      const reference = resolveInternalReference(rawReference, route);
      if (!reference) continue;
      if (reference.startsWith('/assets/')) {
        if (!publicFiles.has(reference)) broken.push(`${path.relative(root, file)}: ${rawReference} -> ${reference}`);
      } else if (!routes.has(reference)) {
        broken.push(`${path.relative(root, file)}: ${rawReference} -> ${reference}`);
      }
    }
  }

  // Pass 2: root-relative href/src attributes and Markdown links in source files.
  const sourceFiles = await fg('src/**/*.{astro,ts,js,json}', { cwd: root, absolute: true });
  for (const file of sourceFiles) {
    const text = await fs.readFile(file, 'utf8');
    const references = [
      ...text.matchAll(/(?:href|src)=["'](\/[^"'#?{]*)/g),
      ...text.matchAll(/\]\((\/[^)#?]*)/g)
    ]
      .map((match) => match[1])
      // Protocol-relative URLs ("//host/...") are external, not site routes.
      .filter((raw) => !raw.startsWith('//'))
      .map((raw) => normalizePath(raw));
    for (const reference of references) {
      if (reference.startsWith('/assets/') || reference === '/robots.txt' || reference === '/sitemap.xml') {
        if (!publicFiles.has(reference)) broken.push(`${path.relative(root, file)}: ${reference}`);
      } else if (!routes.has(reference)) {
        broken.push(`${path.relative(root, file)}: ${reference}`);
      }
    }
  }

  const unique = [...new Set(broken)].sort();
  const report = unique.length
    ? `# Broken Links\n\nRendered output is unavailable in the sandbox; source routes and public assets were audited.\n\n${unique.map((item) => `- ${item}`).join('\n')}\n`
    : '# Broken Links\n\nRendered output is unavailable in the sandbox; source routes and public assets were audited. No broken internal source links were detected.\n';

  // Create reports/ first so a fresh checkout does not fail with ENOENT.
  const reportPath = path.join(root, 'reports/broken-links.md');
  await fs.mkdir(path.dirname(reportPath), { recursive: true });
  await fs.writeFile(reportPath, report);
  console.log(`${unique.length} broken internal source links.`);
  if (unique.length) process.exitCode = 1;
  // Stop here so the rendered-output audit further down does not run.
  process.exit();
}
|
||||
// Rendered-output audit: check every root-relative href/src found in the HTML
// files under `dist` against the files that exist there.
const files = await walk(dist);
const broken = [];

// Resolves to true when the path is accessible, false otherwise.
const pathExists = (target) => fs.access(target).then(() => true, () => false);

// URLs in HTML are percent-encoded while files on disk are not; fall back to
// the raw text when the escape sequence is malformed.
const toDiskPath = (url) => {
  try {
    return decodeURIComponent(url);
  } catch {
    return url;
  }
};

for (const file of files) {
  const html = await fs.readFile(path.join(dist, file), 'utf8');
  for (const match of html.matchAll(/(?:href|src)="(\/[^"#?]*)/g)) {
    const url = match[1];
    // Protocol-relative URLs ("//host/...") point off-site; nothing to check.
    if (url.startsWith('//')) continue;
    const diskPath = toDiskPath(url);
    if (url.startsWith('/assets/')) {
      if (!await pathExists(path.join(dist, diskPath))) broken.push(`${file}: ${url}`);
      continue;
    }
    // A route may be emitted as a directory index, as `<route>.html`, or as a plain file.
    const candidates = [path.join(dist, diskPath, 'index.html'), path.join(dist, `${diskPath}.html`), path.join(dist, diskPath)];
    const found = await Promise.all(candidates.map(pathExists));
    if (!found.includes(true)) broken.push(`${file}: ${url}`);
  }
}

const report = broken.length ? `# Broken Links\n\n${broken.map((item) => `- ${item}`).join('\n')}\n` : '# Broken Links\n\nNo broken internal build links detected.\n';
// Create reports/ first so a fresh checkout does not fail with ENOENT.
const reportPath = path.join(root, 'reports/broken-links.md');
await fs.mkdir(path.dirname(reportPath), { recursive: true });
await fs.writeFile(reportPath, report);
console.log(`${broken.length} broken internal links.`);
if (broken.length) process.exitCode = 1;
|
||||
@@ -0,0 +1,14 @@
|
||||
import fs from 'node:fs/promises';
|
||||
|
||||
// Fetch the live home page once and record the HTTP status (or the failure)
// in reports/live-crawl.json.
const start = 'https://www.azinstitute4autism.com';
const output = new URL('../reports/live-crawl.json', import.meta.url);

// Both the success and the failure path write to reports/, so create it first;
// otherwise the write inside `catch` could itself reject with ENOENT.
await fs.mkdir(new URL('.', output), { recursive: true });

try {
  const response = await fetch(start, { redirect: 'follow' });
  const result = { url: start, status: response.status, checkedAt: new Date().toISOString() };
  await fs.writeFile(output, `${JSON.stringify(result, null, 2)}\n`);
  console.log(`Live site returned ${response.status}.`);
} catch (error) {
  await fs.writeFile(output, `${JSON.stringify({ url: start, error: String(error) }, null, 2)}\n`);
  // A thrown value is not guaranteed to be an Error with a `message`.
  const message = error instanceof Error ? error.message : String(error);
  console.error(`Live crawl unavailable: ${message}`);
  process.exitCode = 1;
}
|
||||
@@ -0,0 +1,203 @@
|
||||
import fs from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
// Project root: one level above the directory that holds this script.
const root = path.resolve(fileURLToPath(import.meta.url), '..', '..');
// Sibling directory of the project that the HTML and asset files are read from.
const mirror = path.resolve(root, '../www.azinstitute4autism.com');
// Output locations inside the project.
const content = path.join(root, 'src/content');
const assets = path.join(root, 'public/assets');
const reports = path.join(root, 'reports');
// Origin prepended to page URLs when building canonical links.
const site = 'https://www.azinstitute4autism.com';

/** Create a directory together with any missing parents. */
const mkdir = (dir) => fs.mkdir(dir, { recursive: true });

/** Collapse whitespace runs, trim, and return the result as a double-quoted JSON string. */
const quote = (value = '') => {
  const collapsed = String(value).replace(/\s+/g, ' ').trim();
  return JSON.stringify(collapsed);
};

/** Wrap a value in double quotes for CSV, doubling any embedded quotes. */
const csv = (value = '') => {
  const escaped = String(value).replaceAll('"', '""');
  return `"${escaped}"`;
};
|
||||
|
||||
/**
 * Recursively list every file below a directory.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} [prefix=''] - Relative path accumulated so far (used by the recursion).
 * @returns {Promise<string[]>} File paths relative to the directory first passed in.
 */
async function walk(dir, prefix = '') {
  const entries = await fs.readdir(dir, { withFileTypes: true });
  const collected = [];
  for (const entry of entries) {
    const relativePath = path.join(prefix, entry.name);
    if (!entry.isDirectory()) {
      collected.push(relativePath);
      continue;
    }
    const nested = await walk(path.join(dir, entry.name), relativePath);
    collected.push(...nested);
  }
  return collected;
}
|
||||
|
||||
/**
 * Map a mirrored file name that carries a `?hsLang=...` or `?hs_amp=true`
 * suffix back to its plain `.html` name.
 *
 * @param {string} file - Mirrored file path.
 * @returns {string} The path with the query-variant suffix replaced by `.html`.
 */
function logical(file) {
  const variantSuffix = /(?:\.html)?\?(?:hsLang=[^.]+|hs_amp=true)\.html$/;
  return file.replace(variantSuffix, '.html');
}
|
||||
|
||||
/**
 * Pick one source file per logical page.
 *
 * Non-HTML files, `hs_amp=true` variants, and anything under a `/page/` or
 * `/author/` segment are ignored. When several files map to the same logical
 * page, a file without a `?` in its name replaces one that has it.
 *
 * @param {string[]} files - Mirrored file paths.
 * @returns {string[]} The chosen file for each logical page.
 */
function selectCanonical(files) {
  const languageOf = (target) => {
    if (target.startsWith('ar/')) return 'ar';
    if (target.startsWith('es/')) return 'es';
    return 'en';
  };
  const chosen = new Map();
  for (const file of files) {
    const skipped = !file.endsWith('.html') || file.includes('hs_amp=true') || /\/(?:page|author)\//.test(file);
    if (skipped) continue;
    const target = logical(file);
    const key = `${languageOf(target)}:${target}`;
    const existing = chosen.get(key);
    const preferNew = existing === undefined || (!file.includes('?') && existing.includes('?'));
    if (preferNew) chosen.set(key, file);
  }
  return [...chosen.values()];
}
|
||||
|
||||
/**
 * Decode the HTML character references this script cares about: decimal and
 * hexadecimal numeric references plus `&amp;`, `&quot;`, `&apos;`, `&lt;`,
 * `&gt;` and `&nbsp;`.
 *
 * Decoding happens in a single pass so already-decoded output is never decoded
 * again (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param {string} [value=''] - Text that may contain character references.
 * @returns {string} The decoded text; unknown or out-of-range references are left as-is.
 */
function decode(value = '') {
  // `&nbsp;` becomes a regular space; callers collapse whitespace afterwards.
  const named = { amp: '&', quot: '"', apos: "'", lt: '<', gt: '>', nbsp: ' ' };
  const reference = /&(?:#(\d+)|#x([\da-f]+)|(amp|quot|apos|lt|gt|nbsp));/gi;
  return value.replace(reference, (entity, decimal, hex, name) => {
    if (name) return named[name.toLowerCase()];
    const codePoint = decimal !== undefined ? Number(decimal) : Number.parseInt(hex, 16);
    // String.fromCodePoint throws a RangeError above U+10FFFF.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}
|
||||
|
||||
/**
 * Reduce an HTML fragment to plain text: tags become spaces, character
 * references are decoded, and whitespace is collapsed and trimmed.
 *
 * @param {string} [value=''] - HTML fragment.
 * @returns {string} Single-line plain text.
 */
function text(value = '') {
  const withoutTags = value.replace(/<[^>]+>/g, ' ');
  const decoded = decode(withoutTags);
  return decoded.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/**
 * Turn an href from the source HTML into a site-relative link: decode character
 * references, drop the site origin, drop `hsLang`/`hs_amp` query variants and a
 * trailing `.html`, and map a bare `index` to `/`.
 *
 * @param {string} [value=''] - Raw href attribute value.
 * @returns {string} The cleaned href.
 */
function normalizeHref(value = '') {
  const siteOrigin = /^https?:\/\/www\.azinstitute4autism\.com/;
  const variantQuery = /(?:\.html)?(?:%3F|\?)(?:hsLang=[^.#]+|hs_amp=true)(?:\.html)?$/;
  let href = decode(value);
  href = href.replace(siteOrigin, '');
  href = href.replace(variantQuery, '');
  href = href.replace(/\.html$/, '');
  return href.replace(/^index$/, '/');
}
|
||||
|
||||
/**
 * Convert inline HTML to text, keeping anchors as Markdown links.
 * Anchors whose label is empty after cleaning are dropped.
 *
 * @param {string} [value=''] - HTML fragment.
 * @returns {string} Plain text with `[label](href)` links.
 */
function inline(value = '') {
  const anchor = /<a[^>]+href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  const toMarkdownLink = (_, href, label) => {
    const cleanLabel = text(label);
    if (!cleanLabel) return '';
    return `[${cleanLabel}](${normalizeHref(href)})`;
  };
  return text(value.replace(anchor, toMarkdownLink));
}
|
||||
|
||||
/**
 * Read the `content` of a `<meta>` tag, accepting either attribute order.
 *
 * @param {string} html - Full page HTML.
 * @param {string} name - Value of the `name` (or `property`) attribute to look for.
 * @param {boolean} [property=false] - Match on `property=` instead of `name=`.
 * @returns {string} Decoded content, or '' when the tag is missing or empty.
 */
function meta(html, name, property = false) {
  const key = property ? 'property' : 'name';
  // Escape regex metacharacters so the name is matched literally.
  const literalName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const selector = `${key}=["']${literalName}["']`;
  // Capture up to the matching closing quote, so an apostrophe inside a
  // double-quoted value (or vice versa) no longer truncates the content.
  const content = `content=(?:"([^"]*)"|'([^']*)')`;
  const forward = new RegExp(`<meta[^>]+${selector}[^>]+${content}`, 'i');
  const reverse = new RegExp(`<meta[^>]+${content}[^>]+${selector}`, 'i');
  const captured = (match) => match?.[1] ?? match?.[2] ?? '';
  return decode(captured(html.match(forward)) || captured(html.match(reverse)));
}
|
||||
|
||||
/**
 * Map a `hubfs` (or `hs-fs/hubfs`) URL to the local image path.
 *
 * @param {string} [value=''] - Source URL or path.
 * @returns {string|undefined} `/assets/images/<file name>`, or undefined when
 *   the value contains no `hubfs/` segment.
 */
function asset(value = '') {
  const found = /(?:hs-fs\/)?hubfs\/([^?#]+)/.exec(value);
  if (!found) return undefined;
  const decodedName = path.basename(decodeURIComponent(found[1]));
  // An encoded `%3F` decodes to `?`; cut anything after it.
  const [fileName] = decodedName.split('?');
  return `/assets/images/${fileName}`;
}
|
||||
|
||||
/**
 * Convert the main content of a page to Markdown using regex rewrites.
 *
 * The `<main>` element is preferred; otherwise the text from the first
 * `blog-post__body` occurrence up to the footer, tag list, or end of article.
 * Images with a local asset mapping become Markdown images, headings, list
 * items and paragraphs are converted, and all remaining tags are stripped.
 *
 * @param {string} html - Full page HTML.
 * @returns {string} Markdown, or '' when no content region is found.
 */
function markdownFrom(html) {
  const region =
    html.match(/<main[\s\S]*?<\/main>/i)?.[0] ||
    html.match(/blog-post__body[\s\S]*?(?=<footer|blog-post__tags|<\/article>)/i)?.[0] ||
    '';
  const converted = region
    // The fallback match begins inside an opening tag (at the class name);
    // drop that leftover fragment so `blog-post__body">` does not leak into the output.
    .replace(/^[^<>]*>/, '')
    .replace(/<(script|style|noscript|header|footer|nav|form)\b[\s\S]*?<\/\1>/gi, '')
    .replace(/<img[^>]+src=["']([^"']+)["'][^>]*alt=["']([^"']*)["'][^>]*>/gi, (_, src, alt) => {
      const local = asset(src);
      // Keep the image as a Markdown image; images without a local copy are dropped.
      return local ? `\n\n![${alt}](${local})\n\n` : '';
    })
    .replace(/<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi, (_, level, value) => `\n\n${'#'.repeat(Number(level))} ${inline(value)}\n\n`)
    .replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_, value) => `\n- ${inline(value)}`)
    .replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, (_, value) => `\n\n${inline(value)}\n\n`)
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<[^>]+>/g, ' ');
  return decode(converted)
    .replace(/[ \t]+/g, ' ')
    .replace(/^\s+$/gm, '')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
||||
|
||||
/**
 * Build the extraction record for one mirrored HTML file.
 *
 * @param {string} file - Mirrored file path (relative to the mirror root).
 * @param {string} html - The file's HTML.
 * @returns {{file: string, lang: string, url: string, title: string, h1: string,
 *   description: string, image: (string|undefined), date: string, markdown: string}}
 */
function record(file, html) {
  const target = logical(file);

  let lang = 'en';
  if (target.startsWith('ar/')) lang = 'ar';
  else if (target.startsWith('es/')) lang = 'es';

  const url = target === 'index.html'
    ? '/'
    : `/${target.replace(/\/index\.html$/, '').replace(/\.html$/, '')}`;

  const firstGroup = (pattern) => html.match(pattern)?.[1];
  // Title falls back to the URL; the H1 falls back to the title.
  const title = text(firstGroup(/<title[^>]*>([\s\S]*?)<\/title>/i) || url);
  const h1 = text(firstGroup(/<h1[^>]*>([\s\S]*?)<\/h1>/i) || title);

  const published = meta(html, 'article:published_time', true) ||
    firstGroup(/<time[^>]+datetime=["']([^"']+)/i) ||
    '';
  // Without a YYYY-MM-DD prefix the fixed default date is used.
  const date = /^\d{4}-\d{2}-\d{2}/.test(published) ? published.slice(0, 10) : '2024-01-01';

  return {
    file,
    lang,
    url,
    title,
    h1,
    description: meta(html, 'description'),
    image: asset(meta(html, 'og:image', true)),
    date,
    markdown: markdownFrom(html)
  };
}
|
||||
|
||||
/**
 * Render a content file: YAML front matter followed by the Markdown body.
 *
 * @param {object} item - Extraction record (`title`, `description`, `url`,
 *   `lang`, `image`, `date`, `markdown`, `h1` are read).
 * @param {boolean} blog - Whether to include the blog-only fields.
 * @returns {string} File contents ending with a newline.
 */
function frontmatter(item, blog) {
  const isLanguageRoot = ['/', '/ar', '/es'].includes(item.url);
  const slug = isLanguageRoot ? 'index' : item.url.split('/').filter(Boolean).at(-1);

  const lines = [
    '---',
    `title: ${quote(item.title)}`,
    `description: ${quote(item.description)}`,
    `slug: ${quote(slug)}`,
    `canonical: ${quote(`${site}${item.url}`)}`,
    `lang: ${quote(item.lang)}`,
    `translationKey: ${quote(slug)}`
  ];
  if (item.image) {
    lines.push(`featuredImage: ${quote(item.image)}`);
  }
  if (blog) {
    lines.push(`date: "${item.date}"`, 'author: "rula-diab"', 'category: "Library"', 'tags: []');
  }
  // The body follows the closing fence directly; an empty body falls back to an H1.
  lines.push('draft: false', '---', item.markdown || `# ${item.h1}`);

  return `${lines.join('\n')}\n`;
}
|
||||
|
||||
/**
 * Copy images, PDFs and fonts from the mirror into the public assets tree and
 * write `asset-inventory.csv` to the reports directory.
 *
 * The `images`, `fonts` and `downloads` folders are removed first. Files are
 * de-duplicated by target name, so the first file with a given name wins.
 *
 * @param {string[]} files - Mirror-relative file paths.
 * @returns {Promise<void>}
 */
async function copySourceAssets(files) {
  const rows = ['source,target,size,kind'];
  const seen = new Set();
  await Promise.all(['images', 'fonts', 'downloads'].map((kind) => fs.rm(path.join(assets, kind), { recursive: true, force: true })));

  const sources = files.filter((name) => name.startsWith('hubfs/') || name.startsWith('hs-fs/hubfs/') || name.startsWith('_hcms/googlefonts/'));
  for (const file of sources) {
    const font = file.startsWith('_hcms/googlefonts/');
    // Fonts keep their sub-path; everything else is flattened to its base name.
    const raw = font ? file.replace('_hcms/googlefonts/', '') : path.basename(file).split('?')[0];
    if (!font && !/\.(?:avif|gif|jpe?g|png|svg|webp|pdf)$/i.test(raw)) continue;
    const key = `${font}:${raw}`;
    if (!raw || seen.has(key)) continue;
    seen.add(key);

    // Case-insensitive to agree with the extension filter above, so `X.PDF`
    // goes to downloads rather than images.
    const kind = font ? 'fonts' : /\.pdf$/i.test(raw) ? 'downloads' : 'images';
    const target = path.join(assets, kind, raw);
    await mkdir(path.dirname(target));
    await fs.copyFile(path.join(mirror, file), target);
    const { size } = await fs.stat(target);
    rows.push([file, path.relative(root, target), size, kind].map(csv).join(','));
  }

  // Make sure the reports directory exists even when called on its own.
  await mkdir(reports);
  await fs.writeFile(path.join(reports, 'asset-inventory.csv'), `${rows.join('\n')}\n`);
}
|
||||
|
||||
/**
 * Remove the `featuredImage` front-matter line from every page and blog file
 * whose referenced image is not present under the public images folder.
 *
 * @returns {Promise<void>}
 */
async function removeMissingFeaturedImages() {
  const featuredImage = /^featuredImage:\s*"\/assets\/images\/([^"]+)"/m;
  for (const type of ['pages', 'blog']) {
    const typeDir = path.join(content, type);
    for (const file of await walk(typeDir)) {
      const target = path.join(typeDir, file);
      const source = await fs.readFile(target, 'utf8');
      const image = featuredImage.exec(source)?.[1];
      if (!image) continue;

      const imageExists = await fs.access(path.join(assets, 'images', image)).then(() => true).catch(() => false);
      if (imageExists) continue;

      await fs.writeFile(target, source.replace(/^featuredImage:.*\n/m, ''));
    }
  }
}
|
||||
|
||||
/**
 * Run the fallback extraction: read the mirror, regenerate the content
 * collections, write the URL inventory, copy assets, and strip references to
 * featured images that were not copied.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const languages = ['en', 'ar', 'es'];
  const contentTypes = ['pages', 'blog', 'authors'];
  const slugFor = (url) => (['/', '/ar', '/es'].includes(url) ? 'index' : url.split('/').filter(Boolean).at(-1));

  const files = await walk(mirror);
  const records = [];
  for (const file of selectCanonical(files)) {
    const html = await fs.readFile(path.join(mirror, file), 'utf8');
    records.push(record(file, html));
  }

  // Start from empty collections, then recreate one folder per type and language.
  await Promise.all(contentTypes.map((type) => fs.rm(path.join(content, type), { recursive: true, force: true })));
  for (const lang of languages) {
    for (const type of contentTypes) {
      await mkdir(path.join(content, type, lang));
    }
  }

  for (const item of records) {
    const blog = item.url.includes('/library/') && !item.url.endsWith('/library');
    const folder = blog ? 'blog' : 'pages';
    const target = path.join(content, folder, item.lang, `${slugFor(item.url)}.md`);
    await fs.writeFile(target, frontmatter(item, blog));
  }

  // The same author entry is written once per language.
  for (const lang of languages) {
    const authorFile = path.join(content, 'authors', lang, 'rula-diab.md');
    await fs.writeFile(authorFile, `---\nname: "Rula Diab"\nslug: "rula-diab"\ndescription: "Founder and clinical leader at Arizona Institute for Autism."\navatar: "/assets/images/rula-diab-avatar.jpg"\nlang: "${lang}"\ntranslationKey: "rula-diab"\n---\n`);
  }

  // Write a placeholder Arabic home page when the mirror did not provide one.
  const hasArabicHome = records.some((item) => item.lang === 'ar' && item.url === '/ar');
  if (!hasArabicHome) {
    await fs.writeFile(path.join(content, 'pages/ar/index.md'), `---\ntitle: "معهد أريزونا للتوحد"\ndescription: "صفحة عربية تمهيدية لمعهد أريزونا للتوحد."\nslug: "index"\ncanonical: "https://www.azinstitute4autism.com/ar"\nlang: "ar"\ntranslationKey: "home"\ndraft: false\n---\n\n<!-- TODO: Replace this placeholder with reviewed Arabic content. -->\n\nتتوفر المقالات العربية الحالية في المكتبة. يرجى مراجعة المحتوى العربي قبل النشر.\n`);
  }

  const rows = [
    'source_file,url,language,type,title,description,h1',
    ...records.map((item) => {
      const type = item.url.includes('/library/') ? 'blog' : 'page';
      return [item.file, item.url, item.lang, type, item.title, item.description, item.h1].map(csv).join(',');
    })
  ];
  await mkdir(reports);
  await fs.writeFile(path.join(reports, 'url-inventory.csv'), `${rows.join('\n')}\n`);

  await copySourceAssets(files);
  await removeMissingFeaturedImages();
  console.log(`Fallback extraction completed: ${records.length} canonical records.`);
}

await main();
|
||||
@@ -0,0 +1,178 @@
|
||||
import fs from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import fg from 'fast-glob';
|
||||
import { load } from 'cheerio';
|
||||
import TurndownService from 'turndown';
|
||||
import matter from 'gray-matter';
|
||||
|
||||
// Project root: one level above the directory that holds this script.
const root = path.resolve(fileURLToPath(import.meta.url), '..', '..');
// Sibling directory of the project that the HTML and asset files are read from.
const mirror = path.resolve(root, '../www.azinstitute4autism.com');
// Origin prepended to page URLs when building canonical links.
const site = 'https://www.azinstitute4autism.com';
// Output locations inside the project.
const contentRoot = path.join(root, 'src/content');
const publicRoot = path.join(root, 'public/assets');
const reportsRoot = path.join(root, 'reports');

// HTML-to-Markdown converter: ATX (`#`) headings and `-` bullets; script-like
// and embedded elements are dropped entirely.
const turndown = new TurndownService({ headingStyle: 'atx', bulletListMarker: '-' });
turndown.remove(['script', 'style', 'noscript', 'iframe']);

/** Create a directory together with any missing parents. */
const mkdir = (target) => fs.mkdir(target, { recursive: true });

/** Collapse whitespace runs to single spaces and trim. */
const clean = (value = '') => {
  const collapsed = value.replace(/\s+/g, ' ');
  return collapsed.trim();
};

/** Wrap a value in double quotes for CSV, doubling any embedded quotes. */
const csv = (value = '') => {
  const escaped = String(value).replaceAll('"', '""');
  return `"${escaped}"`;
};

/** Render a value as a double-quoted JSON string. */
const yaml = (value = '') => {
  const asString = String(value);
  return JSON.stringify(asString);
};
|
||||
/**
 * Map a mirrored file name that carries a `?hsLang=...` or `?hs_amp=true`
 * suffix back to its plain `.html` name.
 *
 * @param {string} file - Mirrored file path.
 * @returns {string} The path with the query-variant suffix replaced by `.html`.
 */
function logicalFile(file) {
  const variantSuffix = /(?:\.html)?\?(?:hsLang=[^.]+|hs_amp=true)\.html$/;
  return file.replace(variantSuffix, '.html');
}
|
||||
|
||||
/**
 * Pick one source file per logical page.
 *
 * Non-HTML files, anything under a `/page/` or `/author/` segment, and
 * `hs_amp=true` variants are ignored. When several files map to the same
 * logical page, a file without a `?` in its name replaces one that has it.
 *
 * @param {string[]} files - Mirrored file paths.
 * @returns {string[]} The chosen file for each logical page.
 */
function canonicalFiles(files) {
  const languageOf = (logical) => {
    if (logical.startsWith('ar/')) return 'ar';
    if (logical.startsWith('es/')) return 'es';
    return 'en';
  };
  const byPage = new Map();
  for (const file of files) {
    const ignored = !file.endsWith('.html') || /\/(?:page|author)\//.test(file) || file.includes('hs_amp=true');
    if (ignored) continue;
    const logical = logicalFile(file);
    const key = `${languageOf(logical)}:${logical}`;
    const current = byPage.get(key);
    const replaces = !current || (!file.includes('?') && current.includes('?'));
    if (replaces) byPage.set(key, file);
  }
  return [...byPage.values()];
}
|
||||
|
||||
/**
 * Derive the site URL path for a mirrored HTML file.
 *
 * @param {string} file - Mirrored file path.
 * @returns {string} `/` for the root index; otherwise the path without a
 *   trailing `/index.html` or `.html`.
 */
function sourceUrl(file) {
  const logical = logicalFile(file);
  if (logical === 'index.html') {
    return '/';
  }
  const withoutIndex = logical.replace(/\/index\.html$/, '');
  return `/${withoutIndex.replace(/\.html$/, '')}`;
}
|
||||
|
||||
/**
 * Determine a page's language: the `ar/` or `es/` path prefix wins, then the
 * primary subtag of `<html lang>`, then `en`.
 *
 * @param {string} file - Mirrored file path.
 * @param {Function} $ - Document query function; `$('html').attr('lang')` is read.
 * @returns {string} Language code.
 */
function languageFor(file, $) {
  for (const lang of ['ar', 'es']) {
    if (file.startsWith(`${lang}/`)) return lang;
  }
  const declared = $('html').attr('lang');
  return declared?.split('-')[0] || 'en';
}
|
||||
|
||||
/**
 * Map a `hubfs` (or `hs-fs/hubfs`) URL to the local image path.
 *
 * @param {string} [value=''] - Image URL or relative path.
 * @returns {string|undefined} `/assets/images/<file name>`, or undefined when
 *   the value contains no `hubfs/` segment.
 */
function localizeAsset(value = '') {
  // Drop the origin and a single leading `../` before looking for the segment.
  const withoutOrigin = value.replace(/^https?:\/\/[^/]+/, '');
  const rooted = withoutOrigin.replace(/^\.\.\//, '/');
  const found = /(?:\/)?(?:hs-fs\/)?hubfs\/([^?#]+)/.exec(rooted);
  if (found === null) return undefined;
  const decodedName = path.basename(decodeURIComponent(found[1]));
  return `/assets/images/${decodedName.split('?')[0]}`;
}
|
||||
|
||||
/**
 * Parse one mirrored HTML file into an extraction record.
 *
 * @param {string} file - Mirror-relative file path.
 * @param {string} html - The file's HTML.
 * @returns {{file: string, lang: string, url: string, title: string,
 *   description: string, h1: string, image: (string|undefined), alt: string,
 *   date: string, markdown: string}}
 */
function extractRecord(file, html) {
  const $ = load(html);
  const lang = languageFor(file, $);
  const url = sourceUrl(file);
  const title = clean($('title').first().text()) || clean($('h1').first().text()) || url;
  const description = $('meta[name="description"]').attr('content') || '';
  const h1 = clean($('h1').first().text());

  const contentImages = '.blog-post__body img, main img, .body-container-wrapper img';
  // Featured image: the Open Graph image, else the first image in the content.
  const image = localizeAsset(
    $('meta[property="og:image"]').attr('content') ||
    $(contentImages).first().attr('src') ||
    ''
  );
  const alt = clean($(contentImages).first().attr('alt') || '');

  const dateText = $('meta[property="article:published_time"]').attr('content') ||
    $('time').first().attr('datetime') || '2024-01-01';
  const date = /^\d{4}-\d{2}-\d{2}/.test(dateText) ? dateText.slice(0, 10) : '2024-01-01';

  // Same blog test as main(): English posts live under a root-level `library/`
  // folder, which has no leading slash and was missed by `includes('/library/')`.
  const isBlog = file.includes('/library/') || file.startsWith('library/');
  const pageSelector = 'main, .body-container-wrapper';
  let region = $(isBlog ? '.blog-post__body' : pageSelector).first();
  // Fall back to the page container when a library file has no post body.
  if (isBlog && region.length === 0) region = $(pageSelector).first();
  const body = region.clone();

  body.find('header, footer, nav, form, script, style, noscript, .hs_cos_wrapper_type_form').remove();
  body.find('*').removeAttr('style').removeAttr('id').removeAttr('data-hs-cos-general-type').removeAttr('data-hs-cos-type');

  // Strip `.html` (and any query that follows it) from link targets.
  body.find('a').each((_, element) => {
    const href = $(element).attr('href');
    if (!href) return;
    $(element).attr('href', href
      .replace(/\.html(?:%3F|\?)[^"]*$/, '')
      .replace(/\.html$/, '')
      .replace(/^index$/, '/'));
  });

  // Point images at their local copies where a mapping exists.
  body.find('img').each((_, element) => {
    const src = localizeAsset($(element).attr('src'));
    if (src) $(element).attr('src', src);
  });

  const markdown = turndown.turndown(body.html() || '').replace(/\n{3,}/g, '\n\n').trim();
  return { file, lang, url, title, description, h1, image, alt, date, markdown };
}
|
||||
|
||||
/**
 * Serialize a record as a Markdown file with YAML front matter.
 *
 * @param {object} record - Extraction record.
 * @param {string} type - `'blog'` adds date, author, category and tags.
 * @returns {string} File contents produced by gray-matter.
 */
function frontmatter(record, type) {
  const languageRoots = ['/', '/ar', '/es'];
  const slug = languageRoots.includes(record.url)
    ? 'index'
    : record.url.split('/').filter(Boolean).at(-1);

  // Key order is kept deliberately: it determines the order in the YAML output.
  const data = {
    title: record.title,
    description: record.description,
    slug,
    canonical: `${site}${record.url}`,
    lang: record.lang,
    translationKey: slug,
    draft: false,
    ...(record.image ? { featuredImage: record.image } : {}),
    ...(record.alt ? { alt: record.alt } : {}),
    ...(type === 'blog'
      ? { date: record.date, author: 'rula-diab', category: 'Library', tags: [] }
      : {})
  };

  // An empty body falls back to a single heading.
  const body = record.markdown || `# ${record.h1 || record.title}\n`;
  return matter.stringify(body, data);
}
|
||||
|
||||
/**
 * Copy images, PDFs and fonts from the mirror into the public assets tree and
 * write `asset-inventory.csv` to the reports directory.
 *
 * The `images`, `fonts` and `downloads` folders are removed first. Files are
 * de-duplicated by target name, so the first file with a given name wins.
 *
 * @returns {Promise<void>}
 */
async function copyAssets() {
  const assets = await fg(['hubfs/*', 'hs-fs/hubfs/*', '_hcms/googlefonts/**/*'], { cwd: mirror, onlyFiles: true });
  const inventory = ['source,target,size,kind'];
  const seen = new Set();
  await Promise.all(['images', 'fonts', 'downloads'].map((kind) => fs.rm(path.join(publicRoot, kind), { recursive: true, force: true })));

  for (const source of assets) {
    const isFont = source.startsWith('_hcms/googlefonts/');
    // Fonts keep their sub-path; everything else is flattened to its base name.
    const rawName = isFont ? source.replace('_hcms/googlefonts/', '') : path.basename(source).split('?')[0];
    if (!isFont && !/\.(?:avif|gif|jpe?g|png|svg|webp|pdf)$/i.test(rawName)) continue;
    const key = `${isFont}:${rawName}`;
    if (!rawName || seen.has(key)) continue;
    seen.add(key);

    // Case-insensitive to agree with the extension filter above, so `X.PDF`
    // goes to downloads rather than images.
    const folder = isFont ? 'fonts' : /\.pdf$/i.test(rawName) ? 'downloads' : 'images';
    const original = path.join(mirror, source);
    const target = path.join(publicRoot, folder, rawName);
    // Only files that are actually copied are stat'ed.
    const stat = await fs.stat(original);
    await mkdir(path.dirname(target));
    await fs.copyFile(original, target);
    inventory.push([source, path.relative(root, target), stat.size, isFont ? 'font' : path.extname(rawName).slice(1)].map(csv).join(','));
  }

  // Make sure the reports directory exists even when called on its own.
  await mkdir(reportsRoot);
  await fs.writeFile(path.join(reportsRoot, 'asset-inventory.csv'), `${inventory.join('\n')}\n`);
}
|
||||
|
||||
/**
 * Run the extraction: create output folders, convert every canonical mirror
 * page into a content file, write the author entries and URL inventory, and
 * copy assets.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const languages = ['en', 'ar', 'es'];
  await Promise.all([
    mkdir(contentRoot), mkdir(publicRoot), mkdir(reportsRoot),
    ...languages.flatMap((lang) => [
      mkdir(path.join(contentRoot, 'pages', lang)),
      mkdir(path.join(contentRoot, 'blog', lang)),
      mkdir(path.join(contentRoot, 'authors', lang))
    ])
  ]);

  const files = canonicalFiles(await fg(['**/*.html'], { cwd: mirror, onlyFiles: true }));
  const records = [];
  for (const file of files) {
    records.push(extractRecord(file, await fs.readFile(path.join(mirror, file), 'utf8')));
  }

  // A file is a library (blog) entry when it sits in a `library/` folder at
  // any depth, including the root. Every other record is written as a page;
  // the former "core pages" set contained exactly those records, so its guard
  // could never skip anything and has been removed.
  const isBlogFile = (file) => file.includes('/library/') || file.startsWith('library/');

  for (const record of records) {
    const isBlog = isBlogFile(record.file);
    const slug = ['/', '/ar', '/es'].includes(record.url) ? 'index' : record.url.split('/').filter(Boolean).at(-1);
    const target = path.join(contentRoot, isBlog ? 'blog' : 'pages', record.lang, `${slug}.md`);
    await fs.writeFile(target, frontmatter(record, isBlog ? 'blog' : 'page'));
  }

  // The same author entry is written once per language.
  for (const lang of languages) {
    await fs.writeFile(path.join(contentRoot, 'authors', lang, 'rula-diab.md'), matter.stringify('', {
      name: 'Rula Diab',
      slug: 'rula-diab',
      description: 'Founder and clinical leader at Arizona Institute for Autism.',
      avatar: '/assets/images/rula-diab-avatar.jpg',
      lang,
      translationKey: 'rula-diab'
    }));
  }

  const urlRows = ['source_file,url,language,type,title,description,h1'];
  for (const record of records) {
    urlRows.push([
      record.file, record.url, record.lang,
      isBlogFile(record.file) ? 'blog' : 'page',
      record.title, record.description, record.h1
    ].map(csv).join(','));
  }
  await fs.writeFile(path.join(reportsRoot, 'url-inventory.csv'), `${urlRows.join('\n')}\n`);

  await copyAssets();
  console.log(`Extracted ${records.length} canonical pages and posts.`);
}

await main();
|
||||
@@ -0,0 +1,12 @@
|
||||
import fs from 'node:fs/promises';
|
||||
|
||||
// Redirects to emit, written in three formats: JSON for the site, CSV for
// review, and nginx rewrite rules.
const redirects = [
  { from: '/aba', to: '/aba-therapy', status: 301, reason: 'Brief alias to preserved mirror URL' },
  { from: '/autismevaluations', to: '/autism-evaluations', status: 301, reason: 'Brief alias to preserved mirror URL' },
  { from: '/learnersocialclub', to: '/learner-social-club', status: 301, reason: 'Brief alias to preserved mirror URL' }
];

// Quote a CSV cell, doubling embedded quotes so a `"` in a reason cannot break the row.
const csvCell = (value) => `"${String(value).replaceAll('"', '""')}"`;
// `from` is placed inside an nginx regex; escape metacharacters so it matches literally.
const escapeRegex = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

const jsonFile = new URL('../src/data/redirects.json', import.meta.url);
const csvFile = new URL('../reports/redirect-map.csv', import.meta.url);
const nginxFile = new URL('../reports/nginx-rewrites.conf', import.meta.url);

// Create the output directories first so a fresh checkout does not fail with ENOENT.
await Promise.all([jsonFile, csvFile].map((file) => fs.mkdir(new URL('.', file), { recursive: true })));

await fs.writeFile(jsonFile, `${JSON.stringify(redirects, null, 2)}\n`);
await fs.writeFile(csvFile,
  `from,to,status,reason\n${redirects.map((r) => `${csvCell(r.from)},${csvCell(r.to)},${r.status},${csvCell(r.reason)}`).join('\n')}\n`);
await fs.writeFile(nginxFile,
  `${redirects.map((r) => `rewrite ^${escapeRegex(r.from)}$ ${r.to} permanent;`).join('\n')}\n`);
|
||||
@@ -0,0 +1,24 @@
|
||||
import fs from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
|
||||
// Project root: the parent of the directory that contains this script.
// fileURLToPath is used instead of URL#pathname because pathname stays
// percent-encoded and is not a valid native path on Windows. It is loaded here
// with a dynamic import so this statement is self-contained.
const { fileURLToPath } = await import('node:url');
const root = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
|
||||
/**
 * Recursively collect every `.md` file below a directory.
 *
 * @param {string} dir - Directory to scan.
 * @returns {Promise<string[]>} Paths built by joining `dir` with each entry name.
 */
async function walk(dir) {
  const entries = await fs.readdir(dir, { withFileTypes: true });
  const found = [];
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      const nested = await walk(fullPath);
      found.push(...nested);
      continue;
    }
    if (entry.name.endsWith('.md')) {
      found.push(fullPath);
    }
  }
  return found;
}
|
||||
// Build public/sitemap.xml from the `canonical` front-matter value of every
// non-draft page and blog file.
const files = [...await walk(path.join(root, 'src/content/pages')), ...await walk(path.join(root, 'src/content/blog'))];
const urls = [];
for (const file of files) {
  const source = await fs.readFile(file, 'utf8');
  // Trim so an unquoted value does not keep trailing spaces or a carriage return.
  const canonical = source.match(/^canonical:\s*["']?([^"'\n]+)["']?/m)?.[1]?.trim();
  const draft = source.match(/^draft:\s*(true|false)/m)?.[1] === 'true';
  if (canonical && !draft) urls.push(canonical);
}

// `&`, `<`, `>` and quotes must be entity-escaped inside <loc>, otherwise a URL
// containing `&` produces invalid XML.
const xmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&apos;' };
const escapeXml = (value) => value.replace(/[&<>"']/g, (char) => xmlEntities[char]);

const xml = `<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n${urls.sort().map((url) => ` <url><loc>${escapeXml(url)}</loc></url>`).join('\n')}\n</urlset>\n`;
await fs.writeFile(path.join(root, 'public/sitemap.xml'), xml);
console.log(`Generated sitemap with ${urls.length} URLs.`);
|
||||
@@ -0,0 +1,24 @@
|
||||
// Sandbox-only validation helper: the autonomous profile cannot read /etc/hosts.
|
||||
const dns = require('node:dns');

// Builds the fixed answer for `localhost`; a new object is returned each time.
const loopbackRecord = () => ({ address: '127.0.0.1', family: 4 });

const originalLookup = dns.lookup;

/**
 * Callback-style lookup: `localhost` is answered with 127.0.0.1 (IPv4) on the
 * next tick; every other hostname is passed to the original implementation.
 */
dns.lookup = function lookup(hostname, options, callback) {
  if (hostname !== 'localhost') return Reflect.apply(originalLookup, this, arguments);

  // Supports the two-argument form `lookup(hostname, callback)`.
  const optionsOmitted = typeof options === 'function';
  const done = optionsOmitted ? options : callback;
  const settings = optionsOmitted ? {} : options;

  if (settings?.all) {
    return process.nextTick(done, null, [loopbackRecord()]);
  }
  return process.nextTick(done, null, '127.0.0.1', 4);
};

const originalPromiseLookup = dns.promises.lookup;

/**
 * Promise-style lookup with the same `localhost` short-circuit.
 */
dns.promises.lookup = async function lookup(hostname, options) {
  if (hostname !== 'localhost') return originalPromiseLookup.call(this, hostname, options);
  const record = loopbackRecord();
  return options?.all ? [record] : record;
};
|
||||
Reference in New Issue
Block a user