mirror of
https://github.com/vitorpamplona/amethyst.git
synced 2025-11-10 20:36:45 +01:00
Improves the speed of the text parser.
This commit is contained in:
@@ -0,0 +1,194 @@
|
|||||||
|
/**
|
||||||
|
* Copyright (c) 2024 Vitor Pamplona
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
* this software and associated documentation files (the "Software"), to deal in
|
||||||
|
* the Software without restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
|
||||||
|
* Software, and to permit persons to whom the Software is furnished to do so,
|
||||||
|
* subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
||||||
|
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||||
|
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package com.vitorpamplona.amethyst.benchmark
|
||||||
|
|
||||||
|
import androidx.benchmark.junit4.BenchmarkRule
|
||||||
|
import androidx.benchmark.junit4.measureRepeated
|
||||||
|
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||||
|
import com.linkedin.urls.detection.UrlDetector
|
||||||
|
import com.linkedin.urls.detection.UrlDetectorOptions
|
||||||
|
import com.vitorpamplona.amethyst.commons.HashTagSegment
|
||||||
|
import com.vitorpamplona.amethyst.commons.ImageSegment
|
||||||
|
import com.vitorpamplona.amethyst.commons.LinkSegment
|
||||||
|
import com.vitorpamplona.amethyst.commons.RichTextParser
|
||||||
|
import com.vitorpamplona.quartz.events.EmptyTagList
|
||||||
|
import junit.framework.TestCase.assertNull
|
||||||
|
import junit.framework.TestCase.assertTrue
|
||||||
|
import org.junit.Rule
|
||||||
|
import org.junit.Test
|
||||||
|
import org.junit.runner.RunWith
|
||||||
|
|
||||||
|
@RunWith(AndroidJUnit4::class)
|
||||||
|
class RichTextParserBenchmark {
|
||||||
|
// JUnit rule shared by every benchmark in this class; each test runs its measured
// body through benchmarkRule.measureRepeated { ... }.
@get:Rule
|
||||||
|
val benchmarkRule = BenchmarkRule()
|
||||||
|
|
||||||
|
/**
 * Measures [RichTextParser.parseMediaUrl] on an `.apk` release download link and
 * checks on every iteration that the call returns null for it.
 */
@Test
fun parseApkUrl() {
    benchmarkRule.measureRepeated {
        val media =
            RichTextParser().parseMediaUrl(
                "https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk",
                EmptyTagList,
            )
        assertNull(media)
    }
}
|
||||||
|
|
||||||
|
/**
 * Measures [RichTextParser.parseText] on a three-word sentence whose middle word is a
 * `.jpg` link, and checks that the second word of the first paragraph is an [ImageSegment].
 */
@Test
fun parseImageUrl() {
    benchmarkRule.measureRepeated {
        val state = RichTextParser().parseText("first https://m.primal.net/HeKw.jpg second", EmptyTagList)
        val secondWord = state.paragraphs[0].words[1]
        assertTrue(secondWord is ImageSegment)
    }
}
|
||||||
|
|
||||||
|
/**
 * Measures [RichTextParser.parseText] on a three-word sentence whose middle word is a
 * domain written without a scheme, and checks that the second word of the first
 * paragraph is a [LinkSegment].
 */
@Test
fun parseNoSchemeUrl() {
    benchmarkRule.measureRepeated {
        val state = RichTextParser().parseText("first amethyst.social second", EmptyTagList)
        val secondWord = state.paragraphs[0].words[1]
        assertTrue(secondWord is LinkSegment)
    }
}
|
||||||
|
|
||||||
|
/**
 * Measures [RichTextParser.parseText] on a three-word sentence whose middle word is a
 * hashtag, and checks that the second word of the first paragraph is a [HashTagSegment].
 */
@Test
fun parseHashtag() {
    benchmarkRule.measureRepeated {
        val state = RichTextParser().parseText("first #amethyst second", EmptyTagList)
        val secondWord = state.paragraphs[0].words[1]
        assertTrue(secondWord is HashTagSegment)
    }
}
|
||||||
|
|
||||||
|
/** Measures a full [RichTextParser.parseText] pass over the [testCase1] fixture with no tags. */
@Test
fun computeTestCase1All() {
    benchmarkRule.measureRepeated {
        val parser = RichTextParser()
        parser.parseText(testCase1, EmptyTagList)
    }
}
|
||||||
|
|
||||||
|
/** Measures a full [RichTextParser.parseText] pass over the [testCase2] fixture with no tags. */
@Test
fun computeTestCase2All() {
    benchmarkRule.measureRepeated {
        val parser = RichTextParser()
        parser.parseText(testCase2, EmptyTagList)
    }
}
|
||||||
|
|
||||||
|
/**
 * Measures only the raw [UrlDetector] run (default options) over the [testCase2] fixture,
 * without any of the filtering done by [RichTextParser].
 */
@Test
fun computeTestCase2UrlDetector() {
    benchmarkRule.measureRepeated {
        val detector = UrlDetector(testCase2, UrlDetectorOptions.Default)
        detector.detect()
    }
}
|
||||||
|
|
||||||
|
/** Measures [RichTextParser.parseValidUrls] (URL detection plus filtering) over the [testCase2] fixture. */
@Test
fun computeTestCase2ParseUrls() {
    benchmarkRule.measureRepeated {
        val parser = RichTextParser()
        parser.parseValidUrls(testCase2)
    }
}
|
||||||
|
|
||||||
|
/** Measures a full [RichTextParser.parseText] pass over the short [testCase3] fixture with no tags. */
@Test
fun computeTestCase3All() {
    benchmarkRule.measureRepeated {
        val parser = RichTextParser()
        parser.parseText(testCase3, EmptyTagList)
    }
}
|
||||||
|
|
||||||
|
/**
 * Arabic-language release announcement used as the input of [computeTestCase1All].
 * It opens with a `#Amethyst` hashtag line, mixes Arabic text with Latin tokens such as
 * `NIP-92`, and ends with a `nostr:nevent1...` reference.
 */
val testCase1 = """
|
||||||
|
#Amethyst v0.83.10
|
||||||
|
|
||||||
|
تحديث جديد لـ Amethyst بإصدار 0.83.10 مع تعديلات وإضافات جديدة
|
||||||
|
|
||||||
|
: NIP-92 إصلاحات الأخطاء
|
||||||
|
|
||||||
|
الإضافات الجديدة:
|
||||||
|
- يتضمن رابط المنتج في الرسالة الأولى من المشتري في السوق
|
||||||
|
- يضيف دعمًا لـ NIP-92 في الرسائل العامة والرسائل المباشرة الجديدة (NIP-17). يبقى NIP-54 في NIP-04 DMs
|
||||||
|
- إضافة التمرير الأفقي إلى أزرار الإجراءات في شاشة النشر الجديد لإصلاح الأزرار المخفية جزئيًا في الشاشات الصغيرة/الرفيعة.
|
||||||
|
|
||||||
|
اصلاحات الشوائب:
|
||||||
|
- إصلاحات التعطل مع مبلغ Zap مخصص غير صالح
|
||||||
|
- يعمل على إصلاح مشكلات إعادة اتصال التتابع عندما يقوم المرحل بإغلاق الاتصال
|
||||||
|
- إصلاح الحشو العلوي للملاحظة المقتبسة في المنشور
|
||||||
|
- تحسين استخدام الذاكرة للمستخدم المرئي وعلامة URL في المشاركات الجديدة
|
||||||
|
|
||||||
|
الترجمات المحدثة:
|
||||||
|
- الفارسية بواسطة
|
||||||
|
- الفرنسية والإنجليزية، المملكة المتحدة بواسطة
|
||||||
|
- الأوكرانية
|
||||||
|
- الإسبانية والإسبانية والمكسيك والإسبانية والولايات المتحدة بواسطة
|
||||||
|
- العربية
|
||||||
|
|
||||||
|
تحسينات جودة الكود:
|
||||||
|
- تحديثات لنظام Android Studio 2023.1.1 Patch 2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
nostr:nevent1qqszq7kl888sw0c5rpvepn8w373zt0jrw8864x8lkauxxw335s66rzgppemhxue69uhkummn9ekx7mp0qgsyvrp9u6p0mfur9dfdru3d853tx9mdjuhkphxuxgfwmryja7zsvhqrqsqqqqqpaax7m2
|
||||||
|
"""
|
||||||
|
|
||||||
|
/**
 * English release announcement used as the input of [computeTestCase2All],
 * [computeTestCase2UrlDetector] and [computeTestCase2ParseUrls].
 * It contains a hashtag, several `nostr:npub1...` mentions, a link written without a
 * scheme (`crowdin.com/profile/...`) and two markdown links to `.apk` downloads.
 */
val testCase2 = """
|
||||||
|
#Amethyst v0.83.10: NIP-92 and Bug Fixes
|
||||||
|
|
||||||
|
New Additions:
|
||||||
|
- Includes a link to the product in the first message from the buyer in the marketplace
|
||||||
|
- Adds support for NIP-92 in public messages and new DMs (NIP-17). NIP-54 stays in NIP-04 DMs
|
||||||
|
- Adds Horizontal Scroll to the action buttons in the New Post screen to partially fix hidden buttons in small/thin screens.
|
||||||
|
|
||||||
|
Bugfixes:
|
||||||
|
- Fixes crash with an invalid custom Zap Amount
|
||||||
|
- Fixes relay re-connection issues when the relay closes a connection
|
||||||
|
- Fixes the top padding of the quoted note in a post
|
||||||
|
- Optimizes memory use of the visual user and url tagger in new posts
|
||||||
|
|
||||||
|
Updated translations:
|
||||||
|
- Persian by nostr:npub1cpazafytvafazxkjn43zjfwtfzatfz508r54f6z6a3rf2ws8223qc3xxpk
|
||||||
|
- French and English, United Kingdom by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
|
||||||
|
- Ukrainian by crowdin.com/profile/liizzzz
|
||||||
|
- Spanish, Spanish, Mexico and Spanish, United States by nostr:npub1luhyzgce7qtcs6r6v00ryjxza8av8u4dzh3avg0zks38tjktnmxspxq903
|
||||||
|
- Arabic by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
|
||||||
|
|
||||||
|
Code Quality Improvements:
|
||||||
|
- Updates to Android Studio 2023.1.1 Patch 2
|
||||||
|
|
||||||
|
Download:
|
||||||
|
- [Play Edition](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk )
|
||||||
|
- [FOSS Edition - No translations](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-fdroid-universal-v0.83.10.apk )
|
||||||
|
"""
|
||||||
|
|
||||||
|
/**
 * Short multi-line note (a hashtag line, a line with an emoji, a plain sentence)
 * used as the input of [computeTestCase3All].
 */
val testCase3 = """#100aDayUntil100k
|
||||||
|
Day 5 ✔️
|
||||||
|
|
||||||
|
Seems like they may be getting easier"""
|
||||||
|
}
|
||||||
@@ -74,21 +74,20 @@ class RichTextParser() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fun parseText(
|
fun parseValidUrls(content: String): LinkedHashSet<String> {
|
||||||
content: String,
|
|
||||||
tags: ImmutableListOfLists<String>,
|
|
||||||
): RichTextViewerState {
|
|
||||||
val urls = UrlDetector(content, UrlDetectorOptions.Default).detect()
|
val urls = UrlDetector(content, UrlDetectorOptions.Default).detect()
|
||||||
|
|
||||||
val urlSet =
|
return urls.mapNotNullTo(LinkedHashSet(urls.size)) {
|
||||||
urls.mapNotNullTo(LinkedHashSet(urls.size)) {
|
if (it.originalUrl.contains("@")) {
|
||||||
// removes e-mails
|
|
||||||
if (Patterns.EMAIL_ADDRESS.matcher(it.originalUrl).matches()) {
|
if (Patterns.EMAIL_ADDRESS.matcher(it.originalUrl).matches()) {
|
||||||
null
|
null
|
||||||
|
} else {
|
||||||
|
it.originalUrl
|
||||||
|
}
|
||||||
} else if (isNumber(it.originalUrl)) {
|
} else if (isNumber(it.originalUrl)) {
|
||||||
null
|
null // avoids urls that look like 123.22
|
||||||
} else if (it.originalUrl.contains("。")) {
|
} else if (it.originalUrl.contains("。")) {
|
||||||
null
|
null // avoids Japanese characters as fake urls
|
||||||
} else {
|
} else {
|
||||||
if (HTTPRegex.matches(it.originalUrl)) {
|
if (HTTPRegex.matches(it.originalUrl)) {
|
||||||
it.originalUrl
|
it.originalUrl
|
||||||
@@ -97,6 +96,13 @@ class RichTextParser() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fun parseText(
|
||||||
|
content: String,
|
||||||
|
tags: ImmutableListOfLists<String>,
|
||||||
|
): RichTextViewerState {
|
||||||
|
val urlSet = parseValidUrls(content)
|
||||||
|
|
||||||
val imagesForPager =
|
val imagesForPager =
|
||||||
urlSet.mapNotNull { fullUrl -> parseMediaUrl(fullUrl, tags) }.associateBy { it.url }
|
urlSet.mapNotNull { fullUrl -> parseMediaUrl(fullUrl, tags) }.associateBy { it.url }
|
||||||
@@ -153,8 +159,29 @@ class RichTextParser() {
|
|||||||
return paragraphSegments.toImmutableList()
|
return paragraphSegments.toImmutableList()
|
||||||
}
|
}
|
||||||
|
|
||||||
fun isNumber(word: String): Boolean {
|
/** Returns true when the whole [word] matches [numberPattern]. */
private fun isNumber(word: String): Boolean {
    val matcher = numberPattern.matcher(word)
    return matcher.matches()
}
|
||||||
return numberPattern.matcher(word).matches()
|
|
||||||
|
/**
 * Per-character pre-filter used by [isPotentialPhoneNumber] before the phone regex runs.
 *
 * Accepts digits, the separators `-`, space and `.`, and also `+`, `(` and `)`.
 * The last three are part of what `android.util.Patterns.PHONE` matches (country-code
 * prefix and parenthesised area code); rejecting them here would keep numbers such as
 * `+15551234567` from ever reaching the regex.
 *
 * @param c character to classify
 * @return true if [c] may appear in a phone number candidate
 */
private fun isPhoneNumberChar(c: Char): Boolean =
    when (c) {
        in '0'..'9', '-', ' ', '.', '+', '(', ')' -> true
        else -> false
    }
|
||||||
|
|
||||||
|
/**
 * Quick screening step for phone detection: true only when [word] is between 7 and 14
 * characters long (inclusive) and every character passes [isPhoneNumberChar].
 *
 * @param word candidate token
 * @return true if [word] is worth handing to the phone regex
 */
fun isPotentialPhoneNumber(word: String): Boolean {
    if (word.length < 7 || word.length > 14) return false
    return word.all { isPhoneNumberChar(it) }
}
|
||||||
|
|
||||||
fun isDate(word: String): Boolean {
|
fun isDate(word: String): Boolean {
|
||||||
@@ -172,48 +199,50 @@ class RichTextParser() {
|
|||||||
emojis: Map<String, String>,
|
emojis: Map<String, String>,
|
||||||
tags: ImmutableListOfLists<String>,
|
tags: ImmutableListOfLists<String>,
|
||||||
): Segment {
|
): Segment {
|
||||||
val emailMatcher = Patterns.EMAIL_ADDRESS.matcher(word)
|
if (word.isEmpty()) return RegularTextSegment(word)
|
||||||
val phoneMatcher = Patterns.PHONE.matcher(word)
|
|
||||||
val schemelessMatcher = noProtocolUrlValidator.matcher(word)
|
|
||||||
|
|
||||||
return if (word.isEmpty()) {
|
if (images.contains(word)) return ImageSegment(word)
|
||||||
RegularTextSegment(word)
|
|
||||||
} else if (images.contains(word)) {
|
if (urls.contains(word)) return LinkSegment(word)
|
||||||
ImageSegment(word)
|
|
||||||
} else if (urls.contains(word)) {
|
if (word.startsWith(":") && emojis.any { word.contains(it.key) }) return EmojiSegment(word)
|
||||||
LinkSegment(word)
|
|
||||||
} else if (emojis.any { word.contains(it.key) }) {
|
if (word.startsWith("lnbc", true)) return InvoiceSegment(word)
|
||||||
EmojiSegment(word)
|
|
||||||
} else if (word.startsWith("lnbc", true)) {
|
if (word.startsWith("lnurl", true)) return WithdrawSegment(word)
|
||||||
InvoiceSegment(word)
|
|
||||||
} else if (word.startsWith("lnurl", true)) {
|
if (word.startsWith("cashuA", true)) return CashuSegment(word)
|
||||||
WithdrawSegment(word)
|
|
||||||
} else if (word.startsWith("cashuA", true)) {
|
if (startsWithNIP19Scheme(word)) return BechSegment(word)
|
||||||
CashuSegment(word)
|
|
||||||
} else if (emailMatcher.matches()) {
|
if (word.startsWith("#")) return parseHash(word, tags)
|
||||||
EmailSegment(word)
|
|
||||||
} else if (word.length in 7..14 && !isDate(word) && phoneMatcher.matches()) {
|
if (word.contains("@")) {
|
||||||
PhoneSegment(word)
|
if (Patterns.EMAIL_ADDRESS.matcher(word).matches()) return EmailSegment(word)
|
||||||
} else if (startsWithNIP19Scheme(word)) {
|
}
|
||||||
BechSegment(word)
|
|
||||||
} else if (word.startsWith("#")) {
|
if (isPotentialPhoneNumber(word) && !isDate(word)) {
|
||||||
parseHash(word, tags)
|
if (Patterns.PHONE.matcher(word).matches()) return PhoneSegment(word)
|
||||||
} else if (word.contains(".") && schemelessMatcher.find()) {
|
}
|
||||||
|
|
||||||
|
val indexOfPeriod = word.indexOf(".")
|
||||||
|
if (indexOfPeriod > 0 && indexOfPeriod < word.length - 1) { // periods cannot be the last one
|
||||||
|
val schemelessMatcher = noProtocolUrlValidator.matcher(word)
|
||||||
|
if (schemelessMatcher.find()) {
|
||||||
val url = schemelessMatcher.group(1) // url
|
val url = schemelessMatcher.group(1) // url
|
||||||
val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
|
val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
|
||||||
val pattern =
|
val pattern =
|
||||||
"""^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
|
"""^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
|
||||||
.toRegex(RegexOption.IGNORE_CASE)
|
.toRegex(RegexOption.IGNORE_CASE)
|
||||||
if (pattern.find(word) != null) {
|
if (pattern.find(word) != null && url != null) {
|
||||||
SchemelessUrlSegment(word, url, additionalChars)
|
return SchemelessUrlSegment(word, url, additionalChars)
|
||||||
} else {
|
|
||||||
RegularTextSegment(word)
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
RegularTextSegment(word)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return RegularTextSegment(word)
|
||||||
|
}
|
||||||
|
|
||||||
private fun parseHash(
|
private fun parseHash(
|
||||||
word: String,
|
word: String,
|
||||||
tags: ImmutableListOfLists<String>,
|
tags: ImmutableListOfLists<String>,
|
||||||
@@ -289,7 +318,11 @@ class RichTextParser() {
|
|||||||
val hashTagsPattern: Pattern =
|
val hashTagsPattern: Pattern =
|
||||||
Pattern.compile("#([^\\s!@#\$%^&*()=+./,\\[{\\]};:'\"?><]+)(.*)", Pattern.CASE_INSENSITIVE)
|
Pattern.compile("#([^\\s!@#\$%^&*()=+./,\\[{\\]};:'\"?><]+)(.*)", Pattern.CASE_INSENSITIVE)
|
||||||
|
|
||||||
val acceptedNIP19schemes = listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1")
|
/**
 * NIP-19 prefixes recognised by the parser: the five lowercase prefixes followed by
 * their uppercase counterparts, in that order. The uppercase half is derived from the
 * single lowercase literal so the two halves cannot drift apart.
 */
val acceptedNIP19schemes =
    listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1").let { lowercasePrefixes ->
        lowercasePrefixes + lowercasePrefixes.map { it.uppercase() }
    }
|
||||||
|
|
||||||
private fun removeQueryParamsForExtensionComparison(fullUrl: String): String {
|
private fun removeQueryParamsForExtensionComparison(fullUrl: String): String {
|
||||||
return if (fullUrl.contains("?")) {
|
return if (fullUrl.contains("?")) {
|
||||||
@@ -344,9 +377,18 @@ class RichTextParser() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Tells whether [word] begins with one of the [acceptedNIP19schemes] prefixes,
 * ignoring letter case and skipping an optional leading `@`, an optional `nostr:`
 * scheme and an optional `@` right after the scheme. This accepts `npub1...`,
 * `@npub1...`, `nostr:npub1...`, `@nostr:npub1...`, `nostr:@npub1...` and their
 * mixed-case variants.
 *
 * The check advances an offset through [word] instead of building lowercased or
 * trimmed copies of it.
 *
 * @param word token to inspect; may be empty
 * @return true if a recognised NIP-19 prefix starts at the computed offset
 */
fun startsWithNIP19Scheme(word: String): Boolean {
    var start = 0
    if (word.startsWith("@")) start++
    if (word.startsWith("nostr:", start, ignoreCase = true)) start += 6
    if (word.startsWith("@", start)) start++

    // Cheap rejection before scanning the list: every prefix currently listed in
    // acceptedNIP19schemes begins with 'n' / 'N'.
    if (start >= word.length) return false
    val first = word[start]
    if (first != 'n' && first != 'N') return false

    return acceptedNIP19schemes.any { word.startsWith(it, start, ignoreCase = true) }
}
|
||||||
|
|
||||||
fun isUrlWithoutScheme(url: String) = noProtocolUrlValidator.matcher(url).matches()
|
fun isUrlWithoutScheme(url: String) = noProtocolUrlValidator.matcher(url).matches()
|
||||||
|
|||||||
Reference in New Issue
Block a user