Improves the speed of the text parser.

This commit is contained in:
Vitor Pamplona
2024-02-21 19:18:30 -05:00
parent 5886c866d3
commit 5b77e39c8b
2 changed files with 299 additions and 63 deletions

View File

@@ -0,0 +1,194 @@
/**
* Copyright (c) 2024 Vitor Pamplona
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package com.vitorpamplona.amethyst.benchmark
import androidx.benchmark.junit4.BenchmarkRule
import androidx.benchmark.junit4.measureRepeated
import androidx.test.ext.junit.runners.AndroidJUnit4
import com.linkedin.urls.detection.UrlDetector
import com.linkedin.urls.detection.UrlDetectorOptions
import com.vitorpamplona.amethyst.commons.HashTagSegment
import com.vitorpamplona.amethyst.commons.ImageSegment
import com.vitorpamplona.amethyst.commons.LinkSegment
import com.vitorpamplona.amethyst.commons.RichTextParser
import com.vitorpamplona.quartz.events.EmptyTagList
import junit.framework.TestCase.assertNull
import junit.framework.TestCase.assertTrue
import org.junit.Rule
import org.junit.Test
import org.junit.runner.RunWith
@RunWith(AndroidJUnit4::class)
class RichTextParserBenchmark {
// JUnit rule that provides the measureRepeated loop used by every benchmark in this class.
@get:Rule
val benchmarkRule = BenchmarkRule()
/**
 * Benchmarks [RichTextParser.parseMediaUrl] on a link to an `.apk` release asset
 * and asserts on every iteration that the parser returns null for it.
 */
@Test
fun parseApkUrl() {
    val apkUrl = "https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk"

    benchmarkRule.measureRepeated {
        val media = RichTextParser().parseMediaUrl(apkUrl, EmptyTagList)
        assertNull(media)
    }
}
/**
 * Benchmarks [RichTextParser.parseText] on a three-word note whose middle word is a
 * `.jpg` URL, asserting that the second word of the first paragraph is an [ImageSegment].
 */
@Test
fun parseImageUrl() {
    val note = "first https://m.primal.net/HeKw.jpg second"

    benchmarkRule.measureRepeated {
        val parsed = RichTextParser().parseText(note, EmptyTagList)
        val secondWord = parsed.paragraphs[0].words[1]
        assertTrue(secondWord is ImageSegment)
    }
}
/**
 * Benchmarks [RichTextParser.parseText] on a three-word note whose middle word is a
 * domain written without a scheme, asserting that the second word of the first
 * paragraph is a [LinkSegment].
 */
@Test
fun parseNoSchemeUrl() {
    val note = "first amethyst.social second"

    benchmarkRule.measureRepeated {
        val parsed = RichTextParser().parseText(note, EmptyTagList)
        val secondWord = parsed.paragraphs[0].words[1]
        assertTrue(secondWord is LinkSegment)
    }
}
/**
 * Benchmarks [RichTextParser.parseText] on a three-word note whose middle word is a
 * hashtag, asserting that the second word of the first paragraph is a [HashTagSegment].
 */
@Test
fun parseHashtag() {
    val note = "first #amethyst second"

    benchmarkRule.measureRepeated {
        val parsed = RichTextParser().parseText(note, EmptyTagList)
        val secondWord = parsed.paragraphs[0].words[1]
        assertTrue(secondWord is HashTagSegment)
    }
}
/** Benchmarks a full [RichTextParser.parseText] pass over [testCase1]; the result is discarded. */
@Test
fun computeTestCase1All() {
    val note = testCase1

    benchmarkRule.measureRepeated { RichTextParser().parseText(note, EmptyTagList) }
}
/** Benchmarks a full [RichTextParser.parseText] pass over [testCase2]; the result is discarded. */
@Test
fun computeTestCase2All() {
    val note = testCase2

    benchmarkRule.measureRepeated { RichTextParser().parseText(note, EmptyTagList) }
}
/**
 * Benchmarks the [UrlDetector] library by itself on [testCase2], using
 * [UrlDetectorOptions.Default]; the detected URLs are discarded.
 */
@Test
fun computeTestCase2UrlDetector() {
    val note = testCase2

    benchmarkRule.measureRepeated {
        val detector = UrlDetector(note, UrlDetectorOptions.Default)
        detector.detect()
    }
}
/** Benchmarks [RichTextParser.parseValidUrls] alone on [testCase2]; the result is discarded. */
@Test
fun computeTestCase2ParseUrls() {
    val note = testCase2

    benchmarkRule.measureRepeated { RichTextParser().parseValidUrls(note) }
}
/** Benchmarks a full [RichTextParser.parseText] pass over [testCase3]; the result is discarded. */
@Test
fun computeTestCase3All() {
    val note = testCase3

    benchmarkRule.measureRepeated { RichTextParser().parseText(note, EmptyTagList) }
}
// Fixture for computeTestCase1All: a multi-line, mostly Arabic release note that starts with a
// hashtag and ends with a nostr:nevent1 reference. The text is benchmark input; keep it verbatim.
val testCase1 = """
#Amethyst v0.83.10
تحديث جديد لـ Amethyst بإصدار 0.83.10 مع تعديلات وإضافات جديدة
: NIP-92 إصلاحات الأخطاء
الإضافات الجديدة:
- يتضمن رابط المنتج في الرسالة الأولى من المشتري في السوق
- يضيف دعمًا لـ NIP-92 في الرسائل العامة والرسائل المباشرة الجديدة (NIP-17). يبقى NIP-54 في NIP-04 DMs
- إضافة التمرير الأفقي إلى أزرار الإجراءات في شاشة النشر الجديد لإصلاح الأزرار المخفية جزئيًا في الشاشات الصغيرة/الرفيعة.
اصلاحات الشوائب:
- إصلاحات التعطل مع مبلغ Zap مخصص غير صالح
- يعمل على إصلاح مشكلات إعادة اتصال التتابع عندما يقوم المرحل بإغلاق الاتصال
- إصلاح الحشو العلوي للملاحظة المقتبسة في المنشور
- تحسين استخدام الذاكرة للمستخدم المرئي وعلامة URL في المشاركات الجديدة
الترجمات المحدثة:
- الفارسية بواسطة
- الفرنسية والإنجليزية، المملكة المتحدة بواسطة
- الأوكرانية
- الإسبانية والإسبانية والمكسيك والإسبانية والولايات المتحدة بواسطة
- العربية
تحسينات جودة الكود:
- تحديثات لنظام Android Studio 2023.1.1 Patch 2
nostr:nevent1qqszq7kl888sw0c5rpvepn8w373zt0jrw8864x8lkauxxw335s66rzgppemhxue69uhkummn9ekx7mp0qgsyvrp9u6p0mfur9dfdru3d853tx9mdjuhkphxuxgfwmryja7zsvhqrqsqqqqqpaax7m2
"""
// Fixture for the computeTestCase2* benchmarks: an English release note containing a hashtag,
// several nostr:npub1 mentions, a link written without a scheme (crowdin.com/...), and two
// markdown links to .apk downloads. The text is benchmark input; keep it verbatim.
val testCase2 = """
#Amethyst v0.83.10: NIP-92 and Bug Fixes
New Additions:
- Includes a link to the product in the first message from the buyer in the marketplace
- Adds support for NIP-92 in public messages and new DMs (NIP-17). NIP-54 stays in NIP-04 DMs
- Adds Horizontal Scroll to the action buttons in the New Post screen to partially fix hidden buttons in small/thin screens.
Bugfixes:
- Fixes crash with an invalid custom Zap Amount
- Fixes relay re-connection issues when the relay closes a connection
- Fixes the top padding of the quoted note in a post
- Optimizes memory use of the visual user and url tagger in new posts
Updated translations:
- Persian by nostr:npub1cpazafytvafazxkjn43zjfwtfzatfz508r54f6z6a3rf2ws8223qc3xxpk
- French and English, United Kingdom by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
- Ukrainian by crowdin.com/profile/liizzzz
- Spanish, Spanish, Mexico and Spanish, United States by nostr:npub1luhyzgce7qtcs6r6v00ryjxza8av8u4dzh3avg0zks38tjktnmxspxq903
- Arabic by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
Code Quality Improvements:
- Updates to Android Studio 2023.1.1 Patch 2
Download:
- [Play Edition](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk )
- [FOSS Edition - No translations](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-fdroid-universal-v0.83.10.apk )
"""
// Fixture for computeTestCase3All: a short three-line note that starts with a hashtag and
// contains an emoji. The text is benchmark input; keep it verbatim.
val testCase3 = """#100aDayUntil100k
Day 5 ✔️
Seems like they may be getting easier"""
}

View File

@@ -74,21 +74,20 @@ class RichTextParser() {
} }
} }
fun parseText( fun parseValidUrls(content: String): LinkedHashSet<String> {
content: String,
tags: ImmutableListOfLists<String>,
): RichTextViewerState {
val urls = UrlDetector(content, UrlDetectorOptions.Default).detect() val urls = UrlDetector(content, UrlDetectorOptions.Default).detect()
val urlSet = return urls.mapNotNullTo(LinkedHashSet(urls.size)) {
urls.mapNotNullTo(LinkedHashSet(urls.size)) { if (it.originalUrl.contains("@")) {
// removes e-mails
if (Patterns.EMAIL_ADDRESS.matcher(it.originalUrl).matches()) { if (Patterns.EMAIL_ADDRESS.matcher(it.originalUrl).matches()) {
null null
} else {
it.originalUrl
}
} else if (isNumber(it.originalUrl)) { } else if (isNumber(it.originalUrl)) {
null null // avoids urls that look like 123.22
} else if (it.originalUrl.contains("")) { } else if (it.originalUrl.contains("")) {
null null // avoids Japanese characters as fake urls
} else { } else {
if (HTTPRegex.matches(it.originalUrl)) { if (HTTPRegex.matches(it.originalUrl)) {
it.originalUrl it.originalUrl
@@ -97,6 +96,13 @@ class RichTextParser() {
} }
} }
} }
}
fun parseText(
content: String,
tags: ImmutableListOfLists<String>,
): RichTextViewerState {
val urlSet = parseValidUrls(content)
val imagesForPager = val imagesForPager =
urlSet.mapNotNull { fullUrl -> parseMediaUrl(fullUrl, tags) }.associateBy { it.url } urlSet.mapNotNull { fullUrl -> parseMediaUrl(fullUrl, tags) }.associateBy { it.url }
@@ -153,8 +159,29 @@ class RichTextParser() {
return paragraphSegments.toImmutableList() return paragraphSegments.toImmutableList()
} }
fun isNumber(word: String): Boolean { private fun isNumber(word: String) = numberPattern.matcher(word).matches()
return numberPattern.matcher(word).matches()
private fun isPhoneNumberChar(c: Char): Boolean {
return when (c) {
in '0'..'9' -> true
'-' -> true
' ' -> true
'.' -> true
else -> false
}
}
fun isPotentialPhoneNumber(word: String): Boolean {
if (word.length !in 7..14) return false
var isPotentialNumber = true
for (c in word) {
if (!isPhoneNumberChar(c)) {
isPotentialNumber = false
break
}
}
return isPotentialNumber
} }
fun isDate(word: String): Boolean { fun isDate(word: String): Boolean {
@@ -172,48 +199,50 @@ class RichTextParser() {
emojis: Map<String, String>, emojis: Map<String, String>,
tags: ImmutableListOfLists<String>, tags: ImmutableListOfLists<String>,
): Segment { ): Segment {
val emailMatcher = Patterns.EMAIL_ADDRESS.matcher(word) if (word.isEmpty()) return RegularTextSegment(word)
val phoneMatcher = Patterns.PHONE.matcher(word)
val schemelessMatcher = noProtocolUrlValidator.matcher(word)
return if (word.isEmpty()) { if (images.contains(word)) return ImageSegment(word)
RegularTextSegment(word)
} else if (images.contains(word)) { if (urls.contains(word)) return LinkSegment(word)
ImageSegment(word)
} else if (urls.contains(word)) { if (word.startsWith(":") && emojis.any { word.contains(it.key) }) return EmojiSegment(word)
LinkSegment(word)
} else if (emojis.any { word.contains(it.key) }) { if (word.startsWith("lnbc", true)) return InvoiceSegment(word)
EmojiSegment(word)
} else if (word.startsWith("lnbc", true)) { if (word.startsWith("lnurl", true)) return WithdrawSegment(word)
InvoiceSegment(word)
} else if (word.startsWith("lnurl", true)) { if (word.startsWith("cashuA", true)) return CashuSegment(word)
WithdrawSegment(word)
} else if (word.startsWith("cashuA", true)) { if (startsWithNIP19Scheme(word)) return BechSegment(word)
CashuSegment(word)
} else if (emailMatcher.matches()) { if (word.startsWith("#")) return parseHash(word, tags)
EmailSegment(word)
} else if (word.length in 7..14 && !isDate(word) && phoneMatcher.matches()) { if (word.contains("@")) {
PhoneSegment(word) if (Patterns.EMAIL_ADDRESS.matcher(word).matches()) return EmailSegment(word)
} else if (startsWithNIP19Scheme(word)) { }
BechSegment(word)
} else if (word.startsWith("#")) { if (isPotentialPhoneNumber(word) && !isDate(word)) {
parseHash(word, tags) if (Patterns.PHONE.matcher(word).matches()) return PhoneSegment(word)
} else if (word.contains(".") && schemelessMatcher.find()) { }
val indexOfPeriod = word.indexOf(".")
if (indexOfPeriod > 0 && indexOfPeriod < word.length - 1) { // periods cannot be the last one
val schemelessMatcher = noProtocolUrlValidator.matcher(word)
if (schemelessMatcher.find()) {
val url = schemelessMatcher.group(1) // url val url = schemelessMatcher.group(1) // url
val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
val pattern = val pattern =
"""^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?""" """^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
.toRegex(RegexOption.IGNORE_CASE) .toRegex(RegexOption.IGNORE_CASE)
if (pattern.find(word) != null) { if (pattern.find(word) != null && url != null) {
SchemelessUrlSegment(word, url, additionalChars) return SchemelessUrlSegment(word, url, additionalChars)
} else {
RegularTextSegment(word)
} }
} else {
RegularTextSegment(word)
} }
} }
return RegularTextSegment(word)
}
private fun parseHash( private fun parseHash(
word: String, word: String,
tags: ImmutableListOfLists<String>, tags: ImmutableListOfLists<String>,
@@ -289,7 +318,11 @@ class RichTextParser() {
val hashTagsPattern: Pattern = val hashTagsPattern: Pattern =
Pattern.compile("#([^\\s!@#\$%^&*()=+./,\\[{\\]};:'\"?><]+)(.*)", Pattern.CASE_INSENSITIVE) Pattern.compile("#([^\\s!@#\$%^&*()=+./,\\[{\\]};:'\"?><]+)(.*)", Pattern.CASE_INSENSITIVE)
val acceptedNIP19schemes = listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1") val acceptedNIP19schemes =
listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1") +
listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1").map {
it.uppercase()
}
private fun removeQueryParamsForExtensionComparison(fullUrl: String): String { private fun removeQueryParamsForExtensionComparison(fullUrl: String): String {
return if (fullUrl.contains("?")) { return if (fullUrl.contains("?")) {
@@ -344,9 +377,18 @@ class RichTextParser() {
} }
fun startsWithNIP19Scheme(word: String): Boolean { fun startsWithNIP19Scheme(word: String): Boolean {
val cleaned = word.lowercase().removePrefix("@").removePrefix("nostr:").removePrefix("@") if (word.isEmpty()) return false
return if (word[0] == 'n' || word[0] == 'N') {
return acceptedNIP19schemes.any { cleaned.startsWith(it) } if (word.startsWith("nostr:n") || word.startsWith("NOSTR:N")) {
acceptedNIP19schemes.any { word.startsWith(it, 6) }
} else {
acceptedNIP19schemes.any { word.startsWith(it) }
}
} else if (word[0] == '@') {
acceptedNIP19schemes.any { word.startsWith(it, 1) }
} else {
false
}
} }
fun isUrlWithoutScheme(url: String) = noProtocolUrlValidator.matcher(url).matches() fun isUrlWithoutScheme(url: String) = noProtocolUrlValidator.matcher(url).matches()