From 5b77e39c8b0e80319a3f44ee5549bb9788bf6176 Mon Sep 17 00:00:00 2001
From: Vitor Pamplona
Date: Wed, 21 Feb 2024 19:18:30 -0500
Subject: [PATCH] Improves the speed of the text parser.

---
Review changes on top of the original commit:
- parseValidUrls: a candidate with an '@' that is not an e-mail is validated
  like any other candidate instead of being accepted as is.
- isPotentialPhoneNumber: also lets '+', '(' and ')' through, so the phone
  regex can still see numbers written with a country or area code.
- word parsing: the schemeless url regex is compiled once and group(4) is
  read null-safely.
- startsWithNIP19Scheme: case-insensitive again and "@nostr:" is handled,
  without String copies; acceptedNIP19schemes stays lowercase only.
- Diffstat and hunk headers follow the new line counts; the commit and blob
  ids are still the ones of the original commit.

 .../benchmark/RichTextParserBenchmark.kt      | 194 ++++++++++++++++++
 .../amethyst/commons/RichTextParser.kt        | 157 +++++++++------
 2 files changed, 289 insertions(+), 62 deletions(-)
 create mode 100644 benchmark/src/androidTest/java/com/vitorpamplona/amethyst/benchmark/RichTextParserBenchmark.kt

diff --git a/benchmark/src/androidTest/java/com/vitorpamplona/amethyst/benchmark/RichTextParserBenchmark.kt b/benchmark/src/androidTest/java/com/vitorpamplona/amethyst/benchmark/RichTextParserBenchmark.kt
new file mode 100644
index 000000000..0cbb36140
--- /dev/null
+++ b/benchmark/src/androidTest/java/com/vitorpamplona/amethyst/benchmark/RichTextParserBenchmark.kt
@@ -0,0 +1,194 @@
+/**
+ * Copyright (c) 2024 Vitor Pamplona
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
+ * Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.vitorpamplona.amethyst.benchmark
+
+import androidx.benchmark.junit4.BenchmarkRule
+import androidx.benchmark.junit4.measureRepeated
+import androidx.test.ext.junit.runners.AndroidJUnit4
+import com.linkedin.urls.detection.UrlDetector
+import com.linkedin.urls.detection.UrlDetectorOptions
+import com.vitorpamplona.amethyst.commons.HashTagSegment
+import com.vitorpamplona.amethyst.commons.ImageSegment
+import com.vitorpamplona.amethyst.commons.LinkSegment
+import com.vitorpamplona.amethyst.commons.RichTextParser
+import com.vitorpamplona.quartz.events.EmptyTagList
+import junit.framework.TestCase.assertNull
+import junit.framework.TestCase.assertTrue
+import org.junit.Rule
+import org.junit.Test
+import org.junit.runner.RunWith
+
+@RunWith(AndroidJUnit4::class)
+class RichTextParserBenchmark {
+    @get:Rule
+    val benchmarkRule = BenchmarkRule()
+
+    @Test
+    fun parseApkUrl() {
+        benchmarkRule.measureRepeated {
+            assertNull(
+                RichTextParser().parseMediaUrl(
+                    "https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk",
+                    EmptyTagList,
+                ),
+            )
+        }
+    }
+
+    @Test
+    fun parseImageUrl() {
+        benchmarkRule.measureRepeated {
+            assertTrue(
+                RichTextParser().parseText(
+                    "first https://m.primal.net/HeKw.jpg second",
+                    EmptyTagList,
+                ).paragraphs[0].words[1] is ImageSegment,
+            )
+        }
+    }
+
+    @Test
+    fun parseNoSchemeUrl() {
+        benchmarkRule.measureRepeated {
+            assertTrue(
+                RichTextParser().parseText(
+                    "first amethyst.social second",
+                    EmptyTagList,
+                ).paragraphs[0].words[1] is LinkSegment,
+            )
+        }
+    }
+
+    @Test
+    fun parseHashtag() {
+        benchmarkRule.measureRepeated {
+            assertTrue(
+                RichTextParser().parseText(
+                    "first #amethyst second",
+                    EmptyTagList,
+                ).paragraphs[0].words[1] is HashTagSegment,
+            )
+        }
+    }
+
+    @Test
+    fun computeTestCase1All() {
+        benchmarkRule.measureRepeated {
+            RichTextParser().parseText(testCase1, EmptyTagList)
+        }
+    }
+
+    @Test
+    fun computeTestCase2All() {
+        benchmarkRule.measureRepeated {
+            RichTextParser().parseText(testCase2, EmptyTagList)
+        }
+    }
+
+    @Test
+    fun computeTestCase2UrlDetector() {
+        benchmarkRule.measureRepeated {
+            UrlDetector(testCase2, UrlDetectorOptions.Default).detect()
+        }
+    }
+
+    @Test
+    fun computeTestCase2ParseUrls() {
+        benchmarkRule.measureRepeated {
+            RichTextParser().parseValidUrls(testCase2)
+        }
+    }
+
+    @Test
+    fun computeTestCase3All() {
+        benchmarkRule.measureRepeated {
+            RichTextParser().parseText(testCase3, EmptyTagList)
+        }
+    }
+
+    val testCase1 = """
+#Amethyst v0.83.10
+
+تحديث جديد لـ Amethyst بإصدار 0.83.10 مع تعديلات وإضافات جديدة
+
+: NIP-92 إصلاحات الأخطاء
+
+ الإضافات الجديدة:
+ - يتضمن رابط المنتج في الرسالة الأولى من المشتري في السوق
+ - يضيف دعمًا لـ NIP-92 في الرسائل العامة والرسائل المباشرة الجديدة (NIP-17). يبقى NIP-54 في NIP-04 DMs
+ - إضافة التمرير الأفقي إلى أزرار الإجراءات في شاشة النشر الجديد لإصلاح الأزرار المخفية جزئيًا في الشاشات الصغيرة/الرفيعة.
+
+ اصلاحات الشوائب:
+ - إصلاحات التعطل مع مبلغ Zap مخصص غير صالح
+ - يعمل على إصلاح مشكلات إعادة اتصال التتابع عندما يقوم المرحل بإغلاق الاتصال
+ - إصلاح الحشو العلوي للملاحظة المقتبسة في المنشور
+ - تحسين استخدام الذاكرة للمستخدم المرئي وعلامة URL في المشاركات الجديدة
+
+ الترجمات المحدثة:
+ - الفارسية بواسطة
+ - الفرنسية والإنجليزية، المملكة المتحدة بواسطة
+ - الأوكرانية
+ - الإسبانية والإسبانية والمكسيك والإسبانية والولايات المتحدة بواسطة
+ - العربية
+
+ تحسينات جودة الكود:
+ - تحديثات لنظام Android Studio 2023.1.1 Patch 2
+
+
+
+
+nostr:nevent1qqszq7kl888sw0c5rpvepn8w373zt0jrw8864x8lkauxxw335s66rzgppemhxue69uhkummn9ekx7mp0qgsyvrp9u6p0mfur9dfdru3d853tx9mdjuhkphxuxgfwmryja7zsvhqrqsqqqqqpaax7m2
+"""
+
+    val testCase2 = """
+#Amethyst v0.83.10: NIP-92 and Bug Fixes
+
+New Additions:
+- Includes a link to the product in the first message from the buyer in the marketplace
+- Adds support for NIP-92 in public messages and new DMs (NIP-17). NIP-54 stays in NIP-04 DMs
+- Adds Horizontal Scroll to the action buttons in the New Post screen to partially fix hidden buttons in small/thin screens.
+
+Bugfixes:
+- Fixes crash with an invalid custom Zap Amount
+- Fixes relay re-connection issues when the relay closes a connection
+- Fixes the top padding of the quoted note in a post
+- Optimizes memory use of the visual user and url tagger in new posts
+
+Updated translations:
+- Persian by nostr:npub1cpazafytvafazxkjn43zjfwtfzatfz508r54f6z6a3rf2ws8223qc3xxpk
+- French and English, United Kingdom by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
+- Ukrainian by crowdin.com/profile/liizzzz
+- Spanish, Spanish, Mexico and Spanish, United States by nostr:npub1luhyzgce7qtcs6r6v00ryjxza8av8u4dzh3avg0zks38tjktnmxspxq903
+- Arabic by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
+
+Code Quality Improvements:
+- Updates to Android Studio 2023.1.1 Patch 2
+
+Download:
+- [Play Edition](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk )
+- [FOSS Edition - No translations](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-fdroid-universal-v0.83.10.apk )
+"""
+
+    val testCase3 = """#100aDayUntil100k
+Day 5 ✔️
+
+Seems like they may be getting easier"""
+}
diff --git a/commons/src/main/java/com/vitorpamplona/amethyst/commons/RichTextParser.kt b/commons/src/main/java/com/vitorpamplona/amethyst/commons/RichTextParser.kt
index 3833912f8..98eaafe26 100644
--- a/commons/src/main/java/com/vitorpamplona/amethyst/commons/RichTextParser.kt
+++ b/commons/src/main/java/com/vitorpamplona/amethyst/commons/RichTextParser.kt
@@ -74,29 +74,34 @@ class RichTextParser() {
         }
     }
 
+    /**
+     * Extracts the links in [content] that should be rendered as urls.
+     *
+     * E-mails, plain numbers (123.22) and candidates with an ideographic full stop are
+     * dropped; everything else must match [HTTPRegex]. The e-mail regex only runs when
+     * the candidate has an '@', and a candidate with an '@' that is not an e-mail still
+     * goes through the remaining checks.
+     */
+    fun parseValidUrls(content: String): LinkedHashSet<String> {
+        val urls = UrlDetector(content, UrlDetectorOptions.Default).detect()
+
+        return urls.mapNotNullTo(LinkedHashSet(urls.size)) { detected ->
+            val url = detected.originalUrl
+            when {
+                url.contains("@") && Patterns.EMAIL_ADDRESS.matcher(url).matches() -> null
+                isNumber(url) -> null // avoids urls that look like 123.22
+                url.contains("。") -> null // avoids Japanese characters as fake urls
+                HTTPRegex.matches(url) -> url
+                else -> null
+            }
+        }
+    }
+
     fun parseText(
         content: String,
         tags: ImmutableListOfLists<String>,
     ): RichTextViewerState {
-        val urls = UrlDetector(content, UrlDetectorOptions.Default).detect()
-
-        val urlSet =
-            urls.mapNotNullTo(LinkedHashSet(urls.size)) {
-                // removes e-mails
-                if (Patterns.EMAIL_ADDRESS.matcher(it.originalUrl).matches()) {
-                    null
-                } else if (isNumber(it.originalUrl)) {
-                    null
-                } else if (it.originalUrl.contains("。")) {
-                    null
-                } else {
-                    if (HTTPRegex.matches(it.originalUrl)) {
-                        it.originalUrl
-                    } else {
-                        null
-                    }
-                }
-            }
+        val urlSet = parseValidUrls(content)
 
         val imagesForPager =
             urlSet.mapNotNull { fullUrl -> parseMediaUrl(fullUrl, tags) }.associateBy { it.url }
@@ -153,8 +158,25 @@ class RichTextParser() {
         return paragraphSegments.toImmutableList()
     }
 
-    fun isNumber(word: String): Boolean {
-        return numberPattern.matcher(word).matches()
+    private fun isNumber(word: String) = numberPattern.matcher(word).matches()
+
+    /**
+     * Characters a phone number may be written with: digits, the '-', ' ' and '.'
+     * separators, the leading '+' of a country code and the parentheses of an area code.
+     */
+    private fun isPhoneNumberChar(c: Char): Boolean =
+        when (c) {
+            in '0'..'9', '-', ' ', '.', '+', '(', ')' -> true
+            else -> false
+        }
+
+    /**
+     * Cheap pre-filter that runs before the Patterns.PHONE regex: the word must be
+     * 7 to 14 characters long and made only of phone number characters.
+     */
+    fun isPotentialPhoneNumber(word: String): Boolean {
+        if (word.length !in 7..14) return false
+        return word.all { isPhoneNumberChar(it) }
     }
 
     fun isDate(word: String): Boolean {
@@ -172,46 +194,46 @@ class RichTextParser() {
         emojis: Map<String, String>,
         tags: ImmutableListOfLists<String>,
     ): Segment {
-        val emailMatcher = Patterns.EMAIL_ADDRESS.matcher(word)
-        val phoneMatcher = Patterns.PHONE.matcher(word)
-        val schemelessMatcher = noProtocolUrlValidator.matcher(word)
+        if (word.isEmpty()) return RegularTextSegment(word)
 
-        return if (word.isEmpty()) {
-            RegularTextSegment(word)
-        } else if (images.contains(word)) {
-            ImageSegment(word)
-        } else if (urls.contains(word)) {
-            LinkSegment(word)
-        } else if (emojis.any { word.contains(it.key) }) {
-            EmojiSegment(word)
-        } else if (word.startsWith("lnbc", true)) {
-            InvoiceSegment(word)
-        } else if (word.startsWith("lnurl", true)) {
-            WithdrawSegment(word)
-        } else if (word.startsWith("cashuA", true)) {
-            CashuSegment(word)
-        } else if (emailMatcher.matches()) {
-            EmailSegment(word)
-        } else if (word.length in 7..14 && !isDate(word) && phoneMatcher.matches()) {
-            PhoneSegment(word)
-        } else if (startsWithNIP19Scheme(word)) {
-            BechSegment(word)
-        } else if (word.startsWith("#")) {
-            parseHash(word, tags)
-        } else if (word.contains(".") && schemelessMatcher.find()) {
-            val url = schemelessMatcher.group(1) // url
-            val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
-            val pattern =
-                """^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
-                    .toRegex(RegexOption.IGNORE_CASE)
-            if (pattern.find(word) != null) {
-                SchemelessUrlSegment(word, url, additionalChars)
-            } else {
-                RegularTextSegment(word)
-            }
-        } else {
-            RegularTextSegment(word)
+        if (images.contains(word)) return ImageSegment(word)
+
+        if (urls.contains(word)) return LinkSegment(word)
+
+        if (word.startsWith(":") && emojis.any { word.contains(it.key) }) return EmojiSegment(word)
+
+        if (word.startsWith("lnbc", true)) return InvoiceSegment(word)
+
+        if (word.startsWith("lnurl", true)) return WithdrawSegment(word)
+
+        if (word.startsWith("cashuA", true)) return CashuSegment(word)
+
+        if (startsWithNIP19Scheme(word)) return BechSegment(word)
+
+        if (word.startsWith("#")) return parseHash(word, tags)
+
+        if (word.contains("@")) {
+            if (Patterns.EMAIL_ADDRESS.matcher(word).matches()) return EmailSegment(word)
         }
+
+        if (isPotentialPhoneNumber(word) && !isDate(word)) {
+            if (Patterns.PHONE.matcher(word).matches()) return PhoneSegment(word)
+        }
+
+        val indexOfPeriod = word.indexOf(".")
+        if (indexOfPeriod > 0 && indexOfPeriod < word.length - 1) { // periods cannot be the last one
+            val schemelessMatcher = noProtocolUrlValidator.matcher(word)
+            if (schemelessMatcher.find()) {
+                val url = schemelessMatcher.group(1) // url
+                // group() returns null when the group did not take part in the match
+                val additionalChars = schemelessMatcher.group(4)?.ifEmpty { null } // additional chars
+                if (url != null && schemelessUrlPattern.find(word) != null) {
+                    return SchemelessUrlSegment(word, url, additionalChars)
+                }
+            }
+        }
+
+        return RegularTextSegment(word)
     }
 
     private fun parseHash(
@@ -290,6 +312,12 @@ class RichTextParser() {
             Pattern.compile("#([^\\s!@#\$%^&*()=+./,\\[{\\]};:'\"?><]+)(.*)", Pattern.CASE_INSENSITIVE)
 
         val acceptedNIP19schemes = listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1")
+
+        // Shape a word must have to be rendered as a url without a scheme. Compiled
+        // once here instead of on every parsed word.
+        private val schemelessUrlPattern =
+            """^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
+                .toRegex(RegexOption.IGNORE_CASE)
 
         private fun removeQueryParamsForExtensionComparison(fullUrl: String): String {
             return if (fullUrl.contains("?")) {
@@ -344,9 +372,14 @@ class RichTextParser() {
         }
 
         fun startsWithNIP19Scheme(word: String): Boolean {
-            val cleaned = word.lowercase().removePrefix("@").removePrefix("nostr:").removePrefix("@")
-
-            return acceptedNIP19schemes.any { cleaned.startsWith(it) }
+            // Walks past the optional "@", "nostr:" and "@" prefixes by index, so no
+            // lowercased or trimmed copy of the word is allocated.
+            var start = 0
+            if (word.startsWith("@")) start++
+            if (word.startsWith("nostr:", start, ignoreCase = true)) start += 6
+            if (word.startsWith("@", start)) start++
+
+            return acceptedNIP19schemes.any { word.startsWith(it, start, ignoreCase = true) }
         }
 
         fun isUrlWithoutScheme(url: String) = noProtocolUrlValidator.matcher(url).matches()