Rework some Rich Media functionality for better error handling

Oban should not retry jobs that are likely to fail again
This commit is contained in:
Mark Felder 2024-07-17 12:40:07 -04:00
parent b4c5cc39f6
commit d124d8645e
8 changed files with 70 additions and 50 deletions

View file

@ -0,0 +1 @@
Prevent Rich Media backfill jobs from retrying in cases where it is likely they will fail again.

View file

@ -4,6 +4,7 @@
defmodule Pleroma.Web.RichMedia.Backfill do
alias Pleroma.Web.RichMedia.Card
alias Pleroma.Web.RichMedia.Helpers
alias Pleroma.Web.RichMedia.Parser
alias Pleroma.Web.RichMedia.Parser.TTL
alias Pleroma.Workers.RichMediaWorker
@ -16,8 +17,7 @@ defmodule Pleroma.Web.RichMedia.Backfill do
Pleroma.Web.ActivityPub.ActivityPub
)
@spec run(map()) ::
:ok | {:error, {:invalid_metadata, any()} | :body_too_large | {:content, any()} | any()}
@spec run(map()) :: :ok | Parser.parse_errors() | Helpers.get_errors()
def run(%{"url" => url} = args) do
url_hash = Card.url_to_hash(url)
@ -33,22 +33,16 @@ def run(%{"url" => url} = args) do
end
warm_cache(url_hash, card)
:ok
{:error, {:invalid_metadata, fields}} ->
Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}")
{:error, type} = error
when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
negative_cache(url_hash)
error
{:error, :body_too_large} ->
Logger.error("Rich media error for #{url}: :body_too_large")
negative_cache(url_hash)
{:error, {:content_type, type}} ->
Logger.debug("Rich media error for #{url}: :content_type is #{type}")
negative_cache(url_hash)
e ->
Logger.debug("Rich media error for #{url}: #{inspect(e)}")
{:error, e}
{:error, type} = error
when type in [:get, :head] ->
error
end
end

View file

@ -5,26 +5,38 @@
defmodule Pleroma.Web.RichMedia.Helpers do
alias Pleroma.Config
require Logger
@type get_errors :: {:error, :body_too_large | :content_type | :head | :get}
@spec rich_media_get(String.t()) :: {:ok, String.t()} | get_errors()
def rich_media_get(url) do
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
head_check =
case Pleroma.HTTP.head(url, headers, http_options()) do
# If the HEAD request didn't reach the server for whatever reason,
# we assume the GET that comes right after won't either
{:error, _} = e ->
e
with {_, {:ok, %Tesla.Env{status: 200, headers: headers}}} <-
{:head, Pleroma.HTTP.head(url, headers, http_options())},
{_, :ok} <- {:content_type, check_content_type(headers)},
{_, :ok} <- {:content_length, check_content_length(headers)},
{_, {:ok, %Tesla.Env{status: 200, body: body}}} <-
{:get, Pleroma.HTTP.get(url, headers, http_options())} do
{:ok, body}
else
{:head, _} ->
Logger.debug("Rich media error for #{url}: HTTP HEAD failed")
{:error, :head}
{:ok, %Tesla.Env{status: 200, headers: headers}} ->
with :ok <- check_content_type(headers),
:ok <- check_content_length(headers),
do: :ok
{:content_type, {_, type}} ->
Logger.debug("Rich media error for #{url}: content-type is #{type}")
{:error, :content_type}
_ ->
:ok
end
{:content_length, {_, length}} ->
Logger.debug("Rich media error for #{url}: content-length is #{length}")
{:error, :body_too_large}
with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options())
{:get, _} ->
Logger.debug("Rich media error for #{url}: HTTP GET failed")
{:error, :get}
end
end
defp check_content_type(headers) do
@ -32,7 +44,7 @@ defp check_content_type(headers) do
{_, content_type} ->
case Plug.Conn.Utils.media_type(content_type) do
{:ok, "text", "html", _} -> :ok
_ -> {:error, {:content_type, content_type}}
_ -> {:error, content_type}
end
_ ->
@ -47,7 +59,7 @@ defp check_content_length(headers) do
{_, maybe_content_length} ->
case Integer.parse(maybe_content_length) do
{content_length, ""} when content_length <= max_body -> :ok
{_, ""} -> {:error, :body_too_large}
{_, ""} -> {:error, maybe_content_length}
_ -> :ok
end

View file

@ -3,6 +3,7 @@
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parser do
alias Pleroma.Web.RichMedia.Helpers
require Logger
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
@ -11,24 +12,26 @@ defp parsers do
Pleroma.Config.get([:rich_media, :parsers])
end
def parse(nil), do: nil
@type parse_errors :: {:error, :rich_media_disabled | :validate}
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url) do
@spec parse(String.t()) ::
{:ok, map()} | parse_errors() | Helpers.get_errors()
def parse(url) when is_binary(url) do
with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},
:ok <- validate_page_url(url),
{:ok, data} <- parse_url(url) do
{_, :ok} <- {:validate, validate_page_url(url)},
{_, {:ok, data}} <- {:parse, parse_url(url)} do
data = Map.put(data, "url", url)
{:ok, data}
else
{:config, _} -> {:error, :rich_media_disabled}
e -> e
{:validate, _} -> {:error, :validate}
{:parse, error} -> error
end
end
defp parse_url(url) do
with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
{:ok, html} <- Floki.parse_document(html) do
with {:ok, body} <- Helpers.rich_media_get(url),
{:ok, html} <- Floki.parse_document(body) do
html
|> maybe_parse()
|> clean_parsed_data()
@ -50,8 +53,8 @@ defp check_parsed_data(%{"title" => title} = data)
{:ok, data}
end
defp check_parsed_data(data) do
{:error, {:invalid_metadata, data}}
defp check_parsed_data(_data) do
{:error, :invalid_metadata}
end
defp clean_parsed_data(data) do

View file

@ -22,7 +22,7 @@ defp get_oembed_url([{"link", attributes, _children} | _]) do
end
defp get_oembed_data(url) do
with {:ok, %Tesla.Env{body: json}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
with {:ok, json} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
Jason.decode(json)
end
end

View file

@ -14,7 +14,17 @@ def perform(%Job{args: %{"op" => "expire", "url" => url} = _args}) do
end
def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do
Backfill.run(args)
case Backfill.run(args) do
:ok ->
:ok
{:error, type}
when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
:cancel
error ->
{:error, error}
end
end
@impl Oban.Worker

View file

@ -20,7 +20,7 @@ test "returns error when no metadata present" do
end
test "doesn't just add a title" do
assert {:error, {:invalid_metadata, _}} = Parser.parse("https://example.com/non-ogp")
assert {:error, :invalid_metadata} = Parser.parse("https://example.com/non-ogp")
end
test "parses ogp" do
@ -96,7 +96,7 @@ test "rejects invalid OGP data" do
end
test "returns error if getting page was not successful" do
assert {:error, :overload} = Parser.parse("https://example.com/error")
assert {:error, :get} = Parser.parse("https://example.com/error")
end
test "does a HEAD request to check if the body is too large" do
@ -104,17 +104,17 @@ test "does a HEAD request to check if the body is too large" do
end
test "does a HEAD request to check if the body is html" do
assert {:error, {:content_type, _}} = Parser.parse("https://example.com/pdf-file")
assert {:error, :content_type} = Parser.parse("https://example.com/pdf-file")
end
test "refuses to crawl incomplete URLs" do
url = "example.com/ogp"
assert :error == Parser.parse(url)
assert {:error, :validate} == Parser.parse(url)
end
test "refuses to crawl malformed URLs" do
url = "example.com[]/ogp"
assert :error == Parser.parse(url)
assert {:error, :validate} == Parser.parse(url)
end
test "refuses to crawl URLs of private network from posts" do
@ -126,7 +126,7 @@ test "refuses to crawl URLs of private network from posts" do
"https://pleroma.local/notice/9kCP7V"
]
|> Enum.each(fn url ->
assert :error == Parser.parse(url)
assert {:error, :validate} == Parser.parse(url)
end)
end

View file

@ -1724,7 +1724,7 @@ def post(url, query, body, headers) do
]
def head(url, _query, _body, _headers) when url in @rich_media_mocks do
{:ok, %Tesla.Env{status: 404, body: ""}}
{:ok, %Tesla.Env{status: 200, body: ""}}
end
def head("https://example.com/pdf-file", _, _, _) do