From c0b4982e33106794683ce54490e7357296811346 Mon Sep 17 00:00:00 2001
From: SouthFox
Date: Sat, 26 Aug 2023 10:42:59 +0800
Subject: [PATCH] [back/feat] clean styles, links and show images

---
 src/main/backend/handlers.clj | 53 +++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/src/main/backend/handlers.clj b/src/main/backend/handlers.clj
index 37fbda2..d28353b 100644
--- a/src/main/backend/handlers.clj
+++ b/src/main/backend/handlers.clj
@@ -1,6 +1,7 @@
 (ns backend.handlers
   (:require [clojure.string :as str]
             [babashka.http-client :as client]
+            [ring.util.codec :refer [url-decode]]
             [cheshire.core :refer [generate-string]])
   (:import [org.jsoup Jsoup]))
 
@@ -9,16 +10,52 @@
   [content]
   (generate-string {:content content}))
 
+(def ^:private redirect-re
+  "Matches the Zhihu outbound-link redirector prefix (http or https)."
+  #"^https?://link\.zhihu\.com/\?target=")
+
+(defn- redirect-target
+  "Returns the URL-decoded target of a Zhihu redirect `href`, or nil when
+  `href` is not a redirect link (or cannot be decoded)."
+  [href]
+  (when-let [prefix (re-find redirect-re href)]
+    (url-decode (subs href (count prefix)))))
+
+(defn clean-html
+  "Tidies the Jsoup selection `docs` in place and returns a vector of its
+  `a` elements:
+
+  - removes `style` elements whose `data-emotion-css` attribute matches
+    `^[a-z0-9]*$`;
+  - removes `img` elements that are direct children of a `figure`;
+  - renames `noscript` elements that are direct children of a `figure`
+    to `div`;
+  - replaces the `href` of Zhihu redirect links with the decoded target."
+  [docs]
+  (.remove (.select docs "style[data-emotion-css~=^[a-z0-9]*$]"))
+  (.remove (.select docs "figure > img"))
+  (.tagName (.select docs "figure > noscript") "div")
+  ;; Only redirect links are touched: decoding every href would corrupt
+  ;; ordinary URLs that legitimately contain percent-escapes, and setting
+  ;; href on anchors that have none would add an empty attribute.
+  (mapv (fn [a]
+          (if-let [target (redirect-target (.attr a "href"))]
+            (.attr a "href" target)
+            a))
+        (.select docs "a")))
 
-(defn fetch-hu-post [request]
+
+(defn fetch-hu-post
+  "Request handler: fetches the Zhihu column post whose id is under the
+  request's `:path-params`, cleans its `Post-RichTextContainer` content
+  and returns it wrapped as a JSON response map."
+  [request]
   (let [id (-> request :path-params :id)
-        post-url (str/join ["https://zhuanlan.zhihu.com/p/" id])]
-
+        post-url (str "https://zhuanlan.zhihu.com/p/" id)
+        docs (-> (client/get post-url) :body Jsoup/parse
+                 (.getElementsByClass "Post-RichTextContainer"))]
+    (clean-html docs)
     {:status 200
      :headers {"Content-Type" "application/json; charset=utf-8"}
-     :body (-> (client/get post-url)
-               :body
-               Jsoup/parse
-               (.getElementsByClass "Post-RichTextContainer")
-               (.toString)
+     :body (-> (.toString docs)
                (wrap-json))}))