diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c0f1b1f..2c29d5f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ ### Fixed - Fix detecting of width and height of HEIF images that include `irot` boxes. - Set `Error` status for errorred traces in OpenTelemetry. +- Fix URL parsing error when a non-http(s) URL contains a `%` symbol outside of the percent-encoded sequence. - (pro) Fix opject detection accuracy when using YOLOv8 or YOLOv10 models. - (pro) Fix usage of the `obj` and `objw` gravity types inside the `crop` processing option. - (pro) Fix detecting of width and height when orientation is specified in EXIF but EXIF info is not requested. diff --git a/imagedata/download.go b/imagedata/download.go index b7026a2a..13217a04 100644 --- a/imagedata/download.go +++ b/imagedata/download.go @@ -7,7 +7,6 @@ import ( "io" "net/http" "net/http/cookiejar" - "net/url" "strings" "time" @@ -134,23 +133,27 @@ func headersToStore(res *http.Response) map[string]string { func BuildImageRequest(ctx context.Context, imageURL string, header http.Header, jar *cookiejar.Jar) (*http.Request, context.CancelFunc, error) { reqCtx, reqCancel := context.WithTimeout(ctx, time.Duration(config.DownloadTimeout)*time.Second) + // Non-http(s) URLs may contain percent symbol outside of the percent-encoded sequences. + // Parsing such URLs will fail with an error. + // To prevent this, we replace all percent symbols with %25. + // + // Also, such URLs may contain a hash symbol, which is a fragment identifier. + // We replace them with %23 to prevent cutting off the fragment part. + // Since we already replaced all percent symbols, we won't mix up %23 that were in the original URL + // and %23 that appeared after the replacement. + // + // We will revert these replacements in `transport/common.GetBucketAndKey`. + if !strings.HasPrefix(imageURL, "http://") && !strings.HasPrefix(imageURL, "https://") { + imageURL = strings.ReplaceAll(imageURL, "%", "%25") + imageURL = strings.ReplaceAll(imageURL, "#", "%23") + } + req, err := http.NewRequestWithContext(reqCtx, "GET", imageURL, nil) if err != nil { reqCancel() return nil, func() {}, ierrors.New(404, err.Error(), msgSourceImageIsUnreachable) } - // S3, GCS, etc object keys may contain `#` symbol. - // `url.ParseRequestURI` unlike `url.Parse` does not cut-off the fragment part from the URL path. - if req.URL.Scheme != "http" && req.URL.Scheme != "https" { - u, err := url.ParseRequestURI(imageURL) - if err != nil { - reqCancel() - return nil, func() {}, ierrors.New(404, err.Error(), msgSourceImageIsUnreachable) - } - req.URL = u - } - if _, ok := enabledSchemes[req.URL.Scheme]; !ok { reqCancel() return nil, func() {}, ierrors.New( diff --git a/transport/common/common.go b/transport/common/common.go index 228fb5c3..e0a41697 100644 --- a/transport/common/common.go +++ b/transport/common/common.go @@ -21,5 +21,16 @@ func GetBucketAndKey(u *url.URL) (bucket, key string) { key = strings.TrimLeft(key, "/") + // We replaced all `%` with `%25` in `imagedata.BuildImageRequest` to prevent parsing errors. + // Also, we replaced all `#` with `%23` to prevent cutting off the fragment part. + // We need to revert these replacements. + // + // It's important to revert %23 first because the original URL may also contain %23, + // and we don't want to mix them up. + bucket = strings.ReplaceAll(bucket, "%23", "#") + bucket = strings.ReplaceAll(bucket, "%25", "%") + key = strings.ReplaceAll(key, "%23", "#") + key = strings.ReplaceAll(key, "%25", "%") + return }