From e650b145dfd161e60c5271673b9ba5d2efa47c54 Mon Sep 17 00:00:00 2001 From: Andrzej Rybczak Date: Sun, 13 Nov 2016 07:22:06 +0100 Subject: [PATCH] Further improve HTML formatting --- src/lyrics_fetcher.cpp | 30 +----------------------------- src/lyrics_fetcher.h | 5 ----- src/utility/html.cpp | 21 +++++++++++++++++---- 3 files changed, 18 insertions(+), 38 deletions(-) diff --git a/src/lyrics_fetcher.cpp b/src/lyrics_fetcher.cpp index 0ae8b936..3ce3f504 100644 --- a/src/lyrics_fetcher.cpp +++ b/src/lyrics_fetcher.cpp @@ -106,6 +106,7 @@ std::vector LyricsFetcher::getContent(const char *regex_, const std void LyricsFetcher::postProcess(std::string &data) const { + data = unescapeHtmlUtf8(data); stripHtmlTags(data); // Remove indentation from each line and collapse multiple newlines into one. std::vector lines; @@ -157,7 +158,6 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s data.clear(); for (auto it = lyrics.begin(); it != lyrics.end(); ++it) { - boost::replace_all(*it, "
", "\n"); stripHtmlTags(*it); boost::trim(*it); if (!it->empty()) @@ -226,34 +226,6 @@ bool GoogleLyricsFetcher::isURLOk(const std::string &url) /**********************************************************************/ -void Sing365Fetcher::postProcess(std::string &data) const -{ - // throw away ad - data = boost::regex_replace(data, boost::regex(""), ""); - LyricsFetcher::postProcess(data); -} - -/**********************************************************************/ - -void JustSomeLyricsFetcher::postProcess(std::string &data) const -{ - data = unescapeHtmlUtf8(data); - LyricsFetcher::postProcess(data); -} - -/**********************************************************************/ - -void MetrolyricsFetcher::postProcess(std::string &data) const -{ - // some of lyrics have both \n chars and
, html tags - // are always present whereas \n chars are not, so we need to - // throw them away to avoid having line breaks doubled. - boost::replace_all(data, " ", ""); - boost::replace_all(data, "
", "\n"); - data = unescapeHtmlUtf8(data); - LyricsFetcher::postProcess(data); -} - bool MetrolyricsFetcher::isURLOk(const std::string &url) { // it sometimes return link to sitemap.xml, which is huge so we need to discard it diff --git a/src/lyrics_fetcher.h b/src/lyrics_fetcher.h index 5222381b..9cafbfb5 100644 --- a/src/lyrics_fetcher.h +++ b/src/lyrics_fetcher.h @@ -82,7 +82,6 @@ protected: virtual const char *regex() const OVERRIDE { return "
(.*?)
"; } virtual bool isURLOk(const std::string &url) OVERRIDE; - virtual void postProcess(std::string &data) const OVERRIDE; }; struct LyricsmaniaFetcher : public GoogleLyricsFetcher @@ -99,8 +98,6 @@ struct Sing365Fetcher : public GoogleLyricsFetcher protected: virtual const char *regex() const OVERRIDE { return "(.*?)"; } - - virtual void postProcess(std::string &data) const OVERRIDE; }; struct JustSomeLyricsFetcher : public GoogleLyricsFetcher @@ -109,8 +106,6 @@ struct JustSomeLyricsFetcher : public GoogleLyricsFetcher protected: virtual const char *regex() const OVERRIDE { return "
\\s*
(.*?) #include #include "utility/html.h" @@ -58,20 +59,32 @@ void unescapeHtmlEntities(std::string &s) boost::replace_all(s, "<", "<"); boost::replace_all(s, " ", " "); boost::replace_all(s, """, "\""); + boost::replace_all(s, "–", "–"); + boost::replace_all(s, "—", "—"); } void stripHtmlTags(std::string &s) { - bool is_p, is_slash_p; + // Erase newlines so they don't duplicate with HTML ones. + s.erase(std::remove_if(s.begin(), s.end(), [](char c) { + return c == '\n' || c == '\r'; + }), s.end()); + + bool is_newline; for (size_t i = s.find("<"); i != std::string::npos; i = s.find("<")) { size_t j = s.find(">", i); if (j != std::string::npos) { ++j; - is_p = s.compare(i, j-i, "

") == 0; - is_slash_p = s.compare(i, j-i, "

") == 0; - if (is_p || is_slash_p) + is_newline + = s.compare(i, std::min(3, j-i), "

") == 0 + || s.compare(i, j-i, "

") == 0 + || s.compare(i, j-i, "
") == 0 + || s.compare(i, j-i, "
") == 0 + || s.compare(i, std::min(4, j-i), "